% Annotated bibliography: symbol grounding, embodied theories of meaning,
% and visual activity / action recognition.
%
% Conventions used in this file:
%   - one field per line, brace-delimited values, trailing comma kept;
%   - authors written "Last, First" and joined with " and ";
%   - page ranges use "--";
%   - conference papers are INPROCEEDINGS entries with a Booktitle field;
%   - DOIs are stored bare in Doi (a resolver link is repeated in URL);
%   - the Abstract field holds the abstract text or, for several of the
%     vision papers, what appears to be a reader's own summary.

@STRING{CVPR = {{IEEE} Conference on Computer Vision and Pattern Recognition}}

@ARTICLE{Barsalou1999,
  Author   = {Barsalou, Lawrence W.},
  Title    = {Perceptual Symbol Systems},
  Journal  = {Behavioral and Brain Sciences},
  Volume   = {22},
  Pages    = {577--609},
  Year     = {1999},
  Abstract = {Prior to the twentieth century, theories of knowledge were inherently perceptual. Since then, developments in logic, statistics, and programming languages have inspired amodal theories that rest on principles fundamentally different from those underlying perception. In addition, perceptual approaches have become widely viewed as untenable because they are assumed to implement recording systems, not conceptual systems. A perceptual theory of knowledge is developed here in the context of current cognitive science and neuroscience. During perceptual experience, association areas in the brain capture bottom-up patterns of activation in sensory-motor areas. Later, in a top-down manner, association areas partially reactivate sensory-motor areas to implement perceptual symbols. The storage and reactivation of perceptual symbols operates at the level of perceptual components -- not at the level of holistic perceptual experiences. Through the use of selective attention, schematic representations of perceptual components are extracted from experience and stored in memory (e.g., individual memories of green, purr, hot). As memories of the same component become organized around a common frame, they implement a simulator that produces limitless simulations of the component (e.g., simulations of purr). Not only do such simulators develop for aspects of sensory experience, they also develop for aspects of proprioception (e.g., lift, run) and introspection (e.g., compare, memory, happy, hungry). Once established, these simulators implement a basic conceptual system that represents types, supports categorization, and produces categorical inferences. These simulators further support productivity, propositions, and abstract concepts, thereby implementing a fully functional conceptual system. Productivity results from integrating simulators combinatorially and recursively to produce complex simulations. Propositions result from binding simulators to perceived individuals to represent type-token relations. Abstract concepts are grounded in complex simulations of combined physical and introspective events. Thus, a perceptual theory of knowledge can implement a fully functional conceptual system while avoiding problems associated with amodal symbol systems. Implications for cognition, neuroscience, evolution, development, and artificial intelligence are explored.},
  URL      = {http://www.service.emory.edu/~barsalou/},
}

% NOTE(review): this entry previously carried the identifier
% 10.1007/978-3-540-88682-2_3, which is the DOI of the ECCV 2008 entry
% (Abhinav2008) below. It has been replaced with the DOI believed to belong
% to this article -- please confirm against the publisher page.
@ARTICLE{Glenberg2000,
  Author   = {Glenberg, Arthur M. and Robertson, David A.},
  Title    = {Symbol Grounding and Meaning: A Comparison of High-Dimensional and Embodied Theories of Meaning},
  Journal  = {Journal of Memory and Language},
  Volume   = {43},
  Number   = {3},
  Pages    = {379--401},
  Year     = {2000},
  Abstract = {Latent Semantic Analysis (Landauer \& Dumais, 1997) and Hyperspace Analogue to Language (Burgess \& Lund, 1997) model meaning as the relations among abstract symbols that are arbitrarily related to what they signify. These symbols are ungrounded in that they are not tied to perceptual experience or action. Because the symbols are ungrounded, they cannot, in principle, capture the meaning of novel situations. In contrast, participants in three experiments found it trivially easy to discriminate between descriptions of sensible novel situations (e.g., using a newspaper to protect one's face from the wind) and nonsense novel situations (e.g., using a matchbook to protect one's face from the wind). These results support the Indexical Hypothesis that the meaning of a sentence is constructed by (a) indexing words and phrases to real objects or perceptual, analog symbols; (b) deriving affordances from the objects and symbols; and (c) meshing the affordances under the guidance of syntax.},
  Keywords = {meaning; language; embodiment; computational models; Latent Semantic Analysis; Hyperspace Analogue to Language},
  Doi      = {10.1006/jmla.2000.2714},
  URL      = {https://doi.org/10.1006/jmla.2000.2714},
}

% Conference paper published in a proceedings volume, so it is recorded as
% INPROCEEDINGS: the proceedings title goes in Booktitle (it was in Edition)
% and Volume holds only the volume number.
@INPROCEEDINGS{Abhinav2008,
  Author    = {Gupta, Abhinav and Davis, Larry S.},
  Title     = {Beyond Nouns: Exploiting Prepositions and Comparative Adjectives for Learning Visual Classifiers},
  Booktitle = {Computer Vision -- {ECCV} 2008},
  Series    = {Lecture Notes in Computer Science},
  Volume    = {5302},
  Pages     = {16--29},
  Publisher = {Springer Berlin / Heidelberg},
  Year      = {2008},
  Abstract  = {Learning visual classifiers for object recognition from weakly labeled data requires determining correspondence between image regions and semantic object classes. Most approaches use co-occurrence of ``nouns'' and image features over large datasets to determine the correspondence, but many correspondence ambiguities remain. We further constrain the correspondence problem by exploiting additional language constructs to improve the learning process from weakly labeled data. We consider both ``prepositions'' and ``comparative adjectives'' which are used to express relationships between objects. If the models of such relationships can be determined, they help resolve correspondence ambiguities. However, learning models of these relationships requires solving the correspondence problem. We simultaneously learn the visual features defining ``nouns'' and the differential visual features defining such ``binary-relationships'' using an EM-based approach.},
  Doi       = {10.1007/978-3-540-88682-2_3},
  URL       = {https://doi.org/10.1007/978-3-540-88682-2_3},
}

@ARTICLE{Cangelosi2000,
  Author   = {Cangelosi, Angelo and Greco, Alberto and Harnad, Stevan},
  Title    = {From Robotic Toil to Symbolic Theft: Grounding Transfer from Entry-Level to Higher-Level Categories},
  Journal  = {Connection Science},
  Volume   = {12},
  Number   = {2},
  Pages    = {143--162},
  Year     = {2000},
  Abstract = {Neural network models of categorical perception (compression of within-category similarity and dilation of between-category differences) are applied to the symbol-grounding problem (of how to connect symbols with meanings) by connecting analog sensorimotor projections to arbitrary symbolic representations via learned category-invariance detectors in a hybrid symbolic/nonsymbolic system. Our nets are trained to categorize and name 50x50 pixel images (e.g., circles, ellipses, squares and rectangles) projected onto the receptive field of a 7x7 retina. They first learn to do prototype matching and then entry-level naming for the four kinds of stimuli, grounding their names directly in the input patterns via hidden-unit representations ("sensorimotor toil"). We show that a higher-level categorization (e.g., "symmetric" vs. "asymmetric") can be learned in two very different ways: either (1) directly from the input, just as with the entry-level categories (i.e., by toil), or (2) indirectly, from boolean combinations of the grounded category names in the form of propositions describing the higher-order category ("symbolic theft"). We analyze the architectures and input conditions that allow grounding (in the form of compression/separation in internal similarity space) to be "transferred" in this second way from directly grounded entry-level category names to higher-order category names. Such hybrid models have implications for the evolution and learning of language.},
  Keywords = {symbol grounding, categorical perception, neural networks, robotics, pattern recognition},
  URL      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.9331},
}

@INPROCEEDINGS{Galata2002,
  Author    = {Galata, Aphrodite and Cohn, Anthony and Magee, Derek and Hogg, David},
  Title     = {Modeling Interaction Using Learnt Qualitative Spatio-Temporal Relations and Variable Length {Markov} Models},
  Booktitle = {Proceedings of the European Conference on Artificial Intelligence ({ECAI}'02)},
  Year      = {2002},
  Abstract  = {Motivated by applications such as automated visual surveillance and video monitoring and annotation, there has been a lot of interest in constructing cognitive vision systems capable of interpreting the high level semantics of dynamic scenes. In this paper we present a novel approach for automatically inferring models of object interactions that can be used to interpret observed behaviour within a scene. A real-time low-level computer vision system, together with an attentional control mechanism, are used to identify incidents or events that occur in the scene. A data driven approach has been taken in order to automatically infer discrete and abstract representations (symbols) of primitive object interactions; effectively the system learns a set of qualitative spatial relations relevant to the dynamic behaviour of the domain. These symbols then form the alphabet of a VLMM which automatically infers the high level structure of typical interactive behaviour. The learnt behaviour model has generative capabilities and is also capable of recognizing typical or atypical activities within a scene. Experiments have been performed within the traffic monitoring domain; however the proposed method is applicable to the general automatic surveillance task since it does not assume a priori knowledge of a specific domain.},
  URL       = {http://www.cs.man.ac.uk/~agalata/publications/galataECAI02.pdf},
}

@ARTICLE{Ivanov2000,
  Author   = {Ivanov, Yuri A. and Bobick, Aaron F.},
  Title    = {Recognition of Visual Activities and Interactions by Stochastic Parsing},
  Journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  Volume   = {22},
  Number   = {8},
  Pages    = {852--872},
  Year     = {2000},
  Abstract = {This paper describes a probabilistic syntactic approach to the detection and recognition of temporally extended activities and interactions between multiple agents. The fundamental idea is to divide the recognition problem into two levels. The lower level detections are performed using standard independent probabilistic event detectors to propose candidate detections of low-level features. The outputs of these detectors provide the input stream for a stochastic context-free grammar parsing mechanism. The grammar and parser provide longer range temporal constraints, disambiguate uncertain low-level detections, and allow the inclusion of a priori knowledge about the structure of temporal events in a given domain. To achieve such a system we: (1) provide techniques for generating a discrete symbol stream from continuous low-level detectors; (2) extend stochastic context-free parsing to handle uncertainty in the input symbol stream; (3) augment a run time parsing algorithm to enforce intersymbol constraints such as requiring temporal consistency between primitives; and (4) extend the consistency filtering to maintain consistent multiobject interaction. We develop a real-time system and demonstrate the approach in several experiments on gesture recognition and in video surveillance. In the surveillance application, we show how the system correctly interprets activities of multiple, interacting objects.},
  Keywords = {Syntactic pattern recognition, action recognition, high level vision, video surveillance, gesture recognition, video monitoring},
  URL      = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.15.2541&rep=rep1&type=pdf},
}

@INPROCEEDINGS{Abhinav_storyline2009,
  Author    = {Gupta, Abhinav and Srinivasan, Praveen and Shi, Jianbo and Davis, Larry S.},
  Title     = {Understanding Videos, Constructing Plots: Learning a Visually Grounded Storyline Model from Annotated Videos},
  Booktitle = CVPR,
  Year      = {2009},
  Abstract  = {Analyzing videos of human activities involves not only recognizing actions (typically based on their appearances), but also determining the story/plot of the video. The storyline of a video describes causal relationships between actions. Beyond recognition of individual actions, discovering causal relationships helps to better understand the semantic meaning of the activities. We present an approach to learn a visually grounded storyline model of videos directly from weakly labeled data. The storyline model is represented as an AND-OR graph, a structure that can compactly encode storyline variation across videos. The edges in the AND-OR graph correspond to causal relationships which are represented in terms of spatio-temporal constraints. We formulate an Integer Programming framework for action recognition and storyline extraction using the storyline model and visual groundings learned from training data.},
  URL       = {http://www.cs.cmu.edu/~abhinavg/papers/cvpr_2009.pdf},
}

@INPROCEEDINGS{Fengjun_2004,
  Author    = {Lv, Fengjun and Kang, Jinman and Nevatia, Ram and Cohen, Isaac and Medioni, G{\'e}rard},
  Title     = {Automatic Tracking and Labeling of Human Activities in a Video Sequence},
  Booktitle = {6th {IEEE} International Workshop on Performance Evaluation of Tracking and Surveillance},
  Year      = {2004},
  Abstract  = {This paper presents a novel approach for tracking multiple objects and a statistical learning approach for detection of human activities in a video sequence. For the tracking, a 2D rigid transformation invariant appearance model combining color and edge information of the detected blob is proposed. For the activity detection, each activity label is regarded as a hypothesis. Given some labeled sequences, a group of features are first extracted from motion trajectories of each detected object and the likelihood of each feature under that hypothesis is calculated. A dynamic programming-based training algorithm is applied to get an optimal classifier for each feature. Then it selects the classifiers with the most discriminative power and combines them to form a stronger classifier. This algorithm complies with Neyman-Pearson criterion so that it is guaranteed to achieve a specified detection rate as well as a minimized false alarm rate. Results on PETS'04 dataset show the effectiveness of the proposed algorithm.},
  URL       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.78.3723&rep=rep1&type=pdf},
}

@INPROCEEDINGS{Mutch_2006,
  Author    = {Mutch, Jim and Lowe, David G.},
  Title     = {Multiclass Object Recognition with Sparse, Localized Features},
  Booktitle = CVPR,
  Year      = {2006},
  Abstract  = {This paper develops a biologically inspired model of visual object recognition to the multiclass object recognition problem. The model modifies that of Serre, Wolf and Poggio. The model consist of five layers: an initial image layer followed by four subsequent layers, each built from previous by alternate template matching and max pooling operations. Image Layer: Image is converted to grayscale. Shorter edge is scaled to 140 pixels while maintaining the aspect ratio. An image pyramid of 10 scale is formed. Gabor Filter (S1 layer): Gabor filter (11X11) is applied in four different orientations at each point of every scale. A 4D structure is thus obtained - a 3D pyramid with four different orientation at each point. Local Invariance (C1) layer: This layer pools nearby S1 units (of same orientation) to create position and scale invariance over large local regions. For each orientation, the S1 pyramid is convolved with a 3D max filter (10X10) in position and 2 units in depth. To achieve subsampling, the max filter is moved around the S1 pyramid in steps of 5 position (but only one in scale), giving a sampling overlap of two units both in position and scale. Intermediate (S2) layer: At every position and scale in the C1 layer, we perform template matches between the patch of C1 units centred at that position/scale and each of 'd' prototype patches. Prototype patches represents the intermediate-level features of the model. Prototypes themselves are sampled randomly from the C1 layers of training images. These prototype patches are like fuzzy templates, consisting of a grid of simpler features that are all slightly position and scale variant. Each of these prototypes can be seen as just another convolution filter which is run over C1. An S2 pyramid is thus generated which has nearly same no. of positions and scales as C1 pyramid but having d types of units at each position/scale. Global Invariance (C2) layer: Finally a d-dimensional vector is obtained each of whose entry is the maximum response of the image (over all positions all scales) for one of the d prototypes patches. SVM classifier is used for the final classification. Improvement over this model included- 1. Instead of computing the responses for all the orientations of a position, they in the feature learning phase, for every n x n position, they store only the dominant orientation value in the patch. 2. For the S1 and C1 unit outputs, at each location they calculate the minimum and maximum responses, and if the response at that location is less than min + h(max-min), then they set it to zero, where h is the inhibition level, which is the fraction of the response range to be suppressed. 3. Instead of taking the maximum response to each S2 feature, at each location and scale, they look at some range of scales and positions to find the response to the S2 feature. 4. Selecting highly weighted features by SVM.},
  URL       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1640736},
}

@INPROCEEDINGS{Ju_Sun_2009,
  Author    = {Sun, Ju and Wu, Xiao and Yan, Shuicheng and Cheong, Loong-Fah and Chua, Tat-Seng and Li, Jintao},
  Title     = {Hierarchical Spatio-Temporal Context Modeling for Action Recognition},
  Booktitle = CVPR,
  Year      = {2009},
  Abstract  = {This paper talks about action recognition on the basis of trajectories. SIFT based trajectories are extracted in the xyt space. These are then used to form three levels of context representations- 1. Point level context- Average of SIFT features extracted from the trajectory. 2. Intra-trajectory context- Displacement vectors on a trajectory are quantized (in terms of both magnitude and orientation) to a set of 25 vectors which form the state set for an ergodic Markov chain. State transition probabilities and unique stationary distributions vectors are obtained. To represent all trajectories over a spatio-temporal volume in fixed length manner, a bag of words method is employed to build a histogram of trajectory occurrence based on extracted Markov chain stationary distribution features. This encodes only the global statistic but misses more detailed information, such as relative position of features or local density of features, etc. 3. Inter-trajectory context- Suppose there are M classes by k means algo on the above. For each trajectory of class 'i' a geometric centre (x0,y0,t0) is found out. In a cubic volume around this the no. of occurrences of trajectories of class 'j' is found. The sum of these values for all trajectories belonging to class 'i' give P(i,j). The matrix P so obtained is again converted into a valid transition matrix for a Markov chain process and corresponding stationary distribution vector is then obtained. Combination of these descriptors and spatio-temporal grids bring out 68 feature channels in total. Best performance of a particular action category generally entails only a few of these many feature channels. Multiple kernel technique is used for classification. This mechanism provides great flexibility since each kernel can operate on different set of features and each feature can be associated with different type of kernels simultaneously.},
  URL       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5206721},
}

@INPROCEEDINGS{Laptev_2008,
  Author    = {Laptev, Ivan and Marsza{\l}ek, Marcin and Schmid, Cordelia and Rozenfeld, Benjamin},
  Title     = {Learning Realistic Human Actions from Movies},
  Booktitle = CVPR,
  Year      = {2008},
  Abstract  = {This paper looks at learning action from movies with help of scripts. Script and subtitles are used to obtain annotations for actions in the video. To cope with the variability of text describing human actions, each scene description is represented as a sparse vector in a high-dimensional feature space. Words, adjacent pairs of words, and non-adjacent pairs of words occurring within a small window of N words are used as features, where N varies between 2 and 8. Features supported by less than three training documents are removed. For the classification a regularized perceptron is used. The classifier is trained on a manually labeled set of scene descriptions. Interest points are detected using a space-time extension of the Harris operator. Histogram descriptors of space-time volumes in the neighborhood of detected points are computed. The size of volume is related to detection scale. Each volume is subdivided into a grid of cuboids; for each cuboid, a coarse histograms of oriented gradient (HoG) and optic flow (HoF) is computed. Normalized histograms are concatenated into HoG and HoF descriptor vectors and are similar in spirit to the well known SIFT descriptor. Clustering a subset of 100k features sampled from the training videos is done with the k-means algorithm to obtain the visual vocabulary. The number of clusters k is set to 4000. The BoF representation then assigns each feature to the closest (we use Euclidean distance) vocabulary word and computes the histogram of visual word occurrences over a space-time volume corresponding either to the entire video sequence or subsequences defined by a spatio-temporal grid. If there are several subsequences the different histograms are concatenated into one vector and then normalized. The combination of six spatial grids with four temporal binnings resulting in 24 possible spatio-temporal grids, is used. Each combination of a spatio-temporal grid with a descriptor, either HoG or HoF, is in the following called a channel. For classification, we use a non-linear support vector machine with a multi-channel kernel that robustly combines channels.},
  URL       = {http://www.irisa.fr/vista/Papers/2008_cvpr_laptev.pdf},
}

@INPROCEEDINGS{Fathi_2008,
  Author    = {Fathi, Alireza and Mori, Greg},
  Title     = {Action Recognition by Learning Mid-level Motion Features},
  Booktitle = CVPR,
  Year      = {2008},
  Abstract  = {This paper develops a method for constructing mid-level motion features which are built from low-level optical flow information. The method operates on a ``figure-centric'' representation obtained by running a detection/tracking algorithm over the input image sequence. The input to this action recognition algorithm will be a stabilized sequence of cropped frames, centered on the human figure. Given the stabilized human figure, the Lucas and Kanade algorithm is employed to compute the optical flow for each frame. The optical flow vector field F is then split into horizontal and vertical components of the flow, Fx and Fy, each of which is then half-wave rectified into four non-negative channels Fx+, Fx-, Fy+, Fy-. Another bin is added which corresponds to zero motion F0 which is obtained by computing the L2 norm of the four basic channels. These five non-negative channels are then blurred with a gaussian and normalized. Blurring the optical flows reduces the influence of noise and small spatial shifts in the figure centric volume. For each frame, low-level motion features are extracted from optical flow channels at pixel locations in that frame and a temporal window of frames adjacent to it. Mid-level motion features are weighted combinations of thresholded low-level features. Each mid-level feature covers a small spatio-temporal cuboid, part of the whole figure-centric volume, from which its low-level features are chosen. Consider k such cuboids. A separate mid-level feature is built for each cuboid. To do this, all the low-level features that are inside that cuboid are collected and considered as potential weak classifiers of an AdaBoost run. After all iterations of the algorithm, we get the final classifier for that cuboid. Finally AdaBoost is used to create a final classifier from mid-level features. Hamming decoding is used for the reduction of the k-way multi-class classification problem into binary classification.},
  URL       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4587735},
}