Aubret, A., Schaumlöffel, T., Roig, G., & Triesch, J. (2024). Learning Object Semantic Similarity with Self-Supervision. Proceedings of the 2024 IEEE International Conference on Development and Learning (ICDL). https://doi.org/10.48550/arXiv.2405.05143
@inproceedings{aubretLearningObjectSemantic2024,
title = {Learning {{Object Semantic Similarity}} with {{Self-Supervision}}},
booktitle = {Proceedings of the 2024 {{IEEE International Conference}} on {{Development}} and {{Learning}} ({{ICDL}})},
author = {Aubret, Arthur and Schaumlöffel, Timothy and Roig, Gemma and Triesch, Jochen},
date = {2024},
eprint = {2405.05143},
eprinttype = {arXiv},
eprintclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2405.05143},
url = {http://arxiv.org/abs/2405.05143},
urldate = {2024-07-01},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
}
Abstract
Humans judge the similarity of two objects not just based on their visual appearance but also based on their semantic relatedness. However, it remains unclear how humans learn about semantic relationships between objects and categories. One important source of semantic knowledge is that semantically related objects frequently co-occur in the same context. For instance, forks and plates are perceived as similar, at least in part, because they are often experienced together in a “kitchen” or “eating” context. Here, we investigate whether a bio-inspired learning principle exploiting such co-occurrence statistics suffices to learn a semantically structured object representation de novo from raw visual or combined visual and linguistic input. To this end, we simulate temporal sequences of visual experience by binding together short video clips of real-world scenes showing objects in different contexts. A bio-inspired neural network model aligns close-in-time visual representations while also aligning visual and category label representations to simulate visuo-language alignment. Our results show that our model clusters object representations based on their context, e.g. kitchen or bedroom, in particular in high-level layers of the network, akin to humans. In contrast, lower-level layers tend to better reflect object identity or category. To achieve this, the model exploits two distinct strategies: the visuo-language alignment ensures that different objects of the same category are represented similarly, whereas the temporal alignment leverages that objects from the same context are frequently seen in succession to make their representations more similar. Overall, our work suggests temporal and visuo-language alignment as plausible computational principles for explaining the origins of certain forms of semantic knowledge in humans.
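The two learning objectives can be made concrete with a short sketch. The following is a minimal illustration assuming SimCLR/CLIP-style symmetric InfoNCE losses, generic image and label encoders, and an assumed weighting term w; it is not the authors' implementation.

```python
# Minimal sketch (not the authors' implementation): temporal alignment of
# close-in-time frames plus visuo-language alignment of frames and category
# labels, both expressed as symmetric InfoNCE losses over a batch.
import torch
import torch.nn.functional as F

def info_nce(z_a, z_b, temperature=0.1):
    """Symmetric InfoNCE; matching rows of z_a and z_b are the positive pairs."""
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)
    logits = z_a @ z_b.t() / temperature                # (B, B) similarities
    targets = torch.arange(z_a.size(0), device=z_a.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

def joint_alignment_loss(img_enc, txt_enc, frame_t, frame_t1, labels, w=1.0):
    """frame_t / frame_t1: two close-in-time video frames; labels: category
    label inputs for the text encoder. The weighting w is an assumption."""
    z_t, z_t1 = img_enc(frame_t), img_enc(frame_t1)
    z_txt = txt_enc(labels)
    return info_nce(z_t, z_t1) + w * info_nce(z_t, z_txt)
```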
Ernst, M. R., López, F. M., Aubret, A., Fleming, R. W., & Triesch, J. (2024). Self-Supervised Learning of Color Constancy. Proceedings of the 2024 IEEE International Conference on Development and Learning (ICDL). http://arxiv.org/abs/2404.08127
@inproceedings{ernstSelfSupervisedLearningColor2024,
title = {Self-{{Supervised Learning}} of {{Color Constancy}}},
booktitle = {Proceedings of the 2024 {{IEEE International Conference}} on {{Development}} and {{Learning}} ({{ICDL}})},
author = {Ernst, Markus R. and López, Francisco M. and Aubret, Arthur and Fleming, Roland W. and Triesch, Jochen},
date = {2024},
eprint = {2404.08127},
eprinttype = {arXiv},
eprintclass = {cs},
publisher = {arXiv},
url = {http://arxiv.org/abs/2404.08127},
urldate = {2024-07-01},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
}
Abstract
Color constancy (CC) describes the ability of the visual system to perceive an object as having a relatively constant color despite changes in lighting conditions. While CC and its limitations have been carefully characterized in humans, it is still unclear how the visual system acquires this ability during development. Here, we present a first study showing that CC develops in a neural network trained in a self-supervised manner through an invariance learning objective. During learning, objects are presented under changing illuminations, while the network aims to map subsequent views of the same object onto close-by latent representations. This gives rise to representations that are largely invariant to the illumination conditions, offering a plausible example of how CC could emerge during human cognitive development via a form of self-supervised learning.
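The invariance objective can be sketched in a few lines. Below, the illumination change is approximated by a random per-channel gain (a von Kries-style diagonal transform) and the objective maximizes the cosine similarity between the representations of the two views; the gain range, the clamping, and the loss form are assumptions, not the paper's training setup.

```python
# Illustrative sketch only (not the paper's training setup): an illumination
# change is approximated by a random per-channel gain, and the invariance
# objective pulls the representations of the original and re-lit views together.
import torch
import torch.nn.functional as F

def relight(images, gains):
    """images: (B, 3, H, W) in [0, 1]; gains: (B, 3) per-channel multipliers."""
    return (images * gains[:, :, None, None]).clamp(0.0, 1.0)

def invariance_loss(encoder, images):
    gains = 0.5 + torch.rand(images.size(0), 3, device=images.device)  # assumed range
    z_ref = F.normalize(encoder(images), dim=-1)
    z_lit = F.normalize(encoder(relight(images, gains)), dim=-1)
    return 1.0 - (z_ref * z_lit).sum(dim=-1).mean()   # 1 - mean cosine similarity
```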
Lahner, B., Dwivedi, K., Iamshchinina, P., Graumann, M., Lascelles, A., Roig, G., Gifford, A. T., Pan, B., Jin, S. Y., Ratan Murty, N. A., Kay, K., Oliva, A., & Cichy, R. M. (2024). Modeling Short Visual Events through the BOLD Moments Video fMRI Dataset and Metadata. Nature Communications, 15(1), 6241. https://doi.org/10.1038/s41467-024-50310-3
@article{lahnerModelingShortVisual2024,
title = {Modeling Short Visual Events through the {{BOLD}} Moments Video {{fMRI}} Dataset and Metadata},
author = {Lahner, Benjamin and Dwivedi, Kshitij and Iamshchinina, Polina and Graumann, Monika and Lascelles, Alex and Roig, Gemma and Gifford, Alessandro Thomas and Pan, Bowen and Jin, SouYoung and Ratan Murty, N. Apurva and Kay, Kendrick and Oliva, Aude and Cichy, Radoslaw},
date = {2024-07-24},
journaltitle = {Nature Communications},
shortjournal = {Nat Commun},
volume = {15},
number = {1},
pages = {6241},
publisher = {Nature Publishing Group},
issn = {2041-1723},
doi = {10.1038/s41467-024-50310-3},
url = {https://www.nature.com/articles/s41467-024-50310-3},
urldate = {2024-07-30},
langid = {english},
keywords = {Neural encoding,Perception,Visual system},
}
Abstract
Studying the neural basis of human dynamic visual perception requires extensive experimental data to evaluate the large swathes of functionally diverse brain neural networks driven by perceiving visual events. Here, we introduce the BOLD Moments Dataset (BMD), a repository of whole-brain fMRI responses to over 1000 short (3 s) naturalistic video clips of visual events across ten human subjects. We use the videos’ extensive metadata to show how the brain represents word- and sentence-level descriptions of visual events and identify correlates of video memorability scores extending into the parietal cortex. Furthermore, we reveal a match in hierarchical processing between cortical regions of interest and video-computable deep neural networks, and we showcase that BMD successfully captures temporal dynamics of visual events at second resolution. With its rich metadata, BMD offers new perspectives and accelerates research on the human brain basis of visual event perception.
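A typical way to use such a dataset is a voxel-wise encoding model: cross-validated ridge regression from per-video stimulus features (e.g., DNN activations) to fMRI responses. The sketch below illustrates that generic analysis; it is not the released BMD pipeline, and the array shapes, alpha grid, and correlation scoring are assumptions.

```python
# Generic voxel-wise encoding-model sketch (not the BMD release code): predict
# fMRI responses from per-video features with cross-validated ridge regression
# and score each voxel by the correlation between held-out predictions and data.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

def encoding_scores(features, voxels, alphas=(1.0, 10.0, 100.0, 1000.0)):
    """features: (n_videos, n_features); voxels: (n_videos, n_voxels)."""
    preds = np.zeros_like(voxels, dtype=float)
    for train, test in KFold(n_splits=5, shuffle=True, random_state=0).split(features):
        model = RidgeCV(alphas=alphas).fit(features[train], voxels[train])
        preds[test] = model.predict(features[test])
    zp = (preds - preds.mean(0)) / (preds.std(0) + 1e-12)
    zv = (voxels - voxels.mean(0)) / (voxels.std(0) + 1e-12)
    return (zp * zv).mean(0)   # Pearson correlation per voxel
```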
Oota, S., Gupta, M., & Toneva, M. (2023). Joint Processing of Linguistic Properties in Brains and Language Models. Advances in Neural Information Processing Systems, 36, 18001–18014. https://proceedings.neurips.cc/paper_files/paper/2023/hash/3a0e2de215bd17c39ad08ba1d16c1b12-Abstract-Conference.html
@article{ootaJointProcessingLinguistic2023a,
title = {Joint Processing of Linguistic Properties in Brains and Language Models},
author = {Oota, Subbareddy and Gupta, Manish and Toneva, Mariya},
date = {2023},
journaltitle = {Advances in Neural Information Processing Systems},
volume = {36},
pages = {18001--18014},
url = {https://proceedings.neurips.cc/paper_files/paper/2023/hash/3a0e2de215bd17c39ad08ba1d16c1b12-Abstract-Conference.html},
urldate = {2024-07-30},
langid = {english},
}
Oota, S. R., Çelik, E., Deniz, F., & Toneva, M. (2024, June 16). Speech Language Models Lack Important Brain-Relevant Semantics. https://doi.org/10.48550/arXiv.2311.04664
@online{ootaSpeechLanguageModels2024,
title = {Speech Language Models Lack Important Brain-Relevant Semantics},
author = {Oota, Subba Reddy and Çelik, Emin and Deniz, Fatma and Toneva, Mariya},
date = {2024-06-16},
eprint = {2311.04664},
eprinttype = {arXiv},
eprintclass = {cs, eess, q-bio},
publisher = {arXiv},
doi = {10.48550/arXiv.2311.04664},
url = {http://arxiv.org/abs/2311.04664},
urldate = {2024-07-01},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Electrical Engineering and Systems Science - Audio and Speech Processing,Quantitative Biology - Neurons and Cognition},
}
Abstract
Despite known differences between reading and listening in the brain, recent work has shown that text-based language models predict both text-evoked and speech-evoked brain activity to an impressive degree. This poses the question of what types of information language models truly predict in the brain. We investigate this question via a direct approach, in which we systematically remove specific low-level stimulus features (textual, speech, and visual) from language model representations to assess their impact on alignment with fMRI brain recordings during reading and listening. Comparing these findings with speech-based language models reveals starkly different effects of low-level features on brain alignment. While text-based models show reduced alignment in early sensory regions post-removal, they retain significant predictive power in late language regions. In contrast, speech-based models maintain strong alignment in early auditory regions even after feature removal but lose all predictive power in late language regions. These results suggest that speech-based models provide insights into additional information processed by early auditory regions, but caution is needed when using them to model processing in late language regions. We make our code publicly available. [https://github.com/subbareddy248/speech-llm-brain]
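The removal step can be approximated by residualization: fit a regularized linear map from the low-level features to the language-model representations and keep what that map cannot explain. The sketch below is illustrative; the paper's exact removal and cross-validation procedure may differ.

```python
# Sketch of the removal idea (illustrative; the paper's exact procedure may
# differ): regress low-level stimulus features out of the language-model
# representations and keep the residuals for the brain-alignment analysis.
import numpy as np
from sklearn.linear_model import Ridge

def remove_low_level(lm_reps, low_level, alpha=1.0):
    """lm_reps: (n_stimuli, d_model); low_level: (n_stimuli, d_low).
    Returns the part of lm_reps not linearly predictable from low_level."""
    model = Ridge(alpha=alpha).fit(low_level, lm_reps)
    return lm_reps - model.predict(low_level)
```

The residualized representations would then enter the same encoding analysis as the original ones, and the drop in brain alignment per region is compared.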
Schaumlöffel, T., Aubret, A., Roig, G., & Triesch, J. (2023). Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic Play. 2023 IEEE International Conference on Development and Learning (ICDL), 67–72. https://doi.org/10.1109/ICDL55364.2023.10364409
@inproceedings{schaumloffelCaregiverTalkShapes2023,
title = {Caregiver {{Talk Shapes Toddler Vision}}: {{A Computational Study}} of {{Dyadic Play}}},
shorttitle = {Caregiver {{Talk Shapes Toddler Vision}}},
booktitle = {2023 {{IEEE International Conference}} on {{Development}} and {{Learning}} ({{ICDL}})},
author = {Schaumlöffel, Timothy and Aubret, Arthur and Roig, Gemma and Triesch, Jochen},
date = {2023-11-09},
eprint = {2312.04118},
eprinttype = {arXiv},
eprintclass = {cs},
pages = {67--72},
doi = {10.1109/ICDL55364.2023.10364409},
url = {http://arxiv.org/abs/2312.04118},
urldate = {2024-07-01},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
}
Abstract
Infants’ ability to recognize and categorize objects develops gradually. The second year of life is marked by both the emergence of more semantic visual representations and a better understanding of word meaning. This suggests that language input may play an important role in shaping visual representations. However, even in suitable contexts for word learning like dyadic play sessions, caregivers' utterances are sparse and ambiguous, often referring to objects that are different from the one to which the child attends. Here, we systematically investigate to what extent caregivers’ utterances can nevertheless enhance visual representations. For this, we propose a computational model of visual representation learning during dyadic play. We introduce a synthetic dataset of ego-centric images perceived by a toddler-agent that moves and rotates toy objects in different parts of its home environment while hearing caregivers’ utterances, modeled as captions. We propose to model toddlers’ learning as simultaneously aligning representations for 1) close-in-time images and 2) co-occurring images and utterances. We show that utterances with statistics matching those of real caregivers give rise to representations supporting improved category recognition. Our analysis reveals that a small decrease/increase in object-relevant naming frequencies can drastically impact the learned representations. This affects the attention on object names within an utterance, which is required for efficient visuo-linguistic alignment. Overall, our results support the hypothesis that caregivers’ naming utterances can improve toddlers’ visual representations.
Schaumlöffel, T., Vilas, M. G., & Roig, G. (2023). PEACS: Prefix Encoding for Auditory Caption Synthesis (Technical report, DCASE 2023 Challenge, Task 6a). https://dcase.community/documents/challenge2023/technical_reports/DCASE2023_Schaumloeffel_107_t6a.pdf
@report{schaumloffelPEACSPREFIXENCODING2023,
title = {{{PEACS}}: Prefix Encoding for Auditory Caption Synthesis},
author = {Schaumlöffel, Timothy and Vilas, Martina G. and Roig, Gemma},
date = {2023},
type = {Technical report},
institution = {{DCASE 2023 Challenge}},
url = {https://dcase.community/documents/challenge2023/technical_reports/DCASE2023_Schaumloeffel_107_t6a.pdf},
langid = {english}
}
Abstract
This technical report describes an Automated Audio Captioning system for the Detection and Classification of Acoustic Scenes and Events (DCASE) 2023 Challenge, Task 6a (automated audio captioning). Our approach employs an encoder-decoder architecture, with the encoder utilizing a large contrastive pre-trained HTS-AT capable of handling variable-length audio segments. The decoder is based on the GPT2 model. To incorporate audio into the decoding process, we employ a light mapping network that translates audio representations into a prefix, effectively guiding the decoder’s generation process. Given the limited data availability, we pre-train our model on various audio captioning datasets and fine-tune it on Clotho. We reach a SPIDEr-FL score of 29.3 on the evaluation split of the Clotho-v2 dataset.
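The prefix mechanism can be illustrated with a small mapping module: a fixed-size audio embedding is expanded into a short sequence of prefix embeddings that is concatenated in front of the caption token embeddings before decoding. The module below is a sketch under assumed dimensions (768-d embeddings, 10 prefix tokens) and a simple two-layer MLP; the actual system uses an HTS-AT encoder and a GPT-2 decoder.

```python
# Minimal sketch of the prefix idea (not the submitted system): a mapping
# network expands a fixed-size audio embedding into a short sequence of prefix
# embeddings, which is prepended to the caption token embeddings before being
# fed to the language-model decoder. Dimensions and the MLP are assumptions.
import torch
import torch.nn as nn

class PrefixMapper(nn.Module):
    def __init__(self, audio_dim=768, prefix_len=10, d_model=768):
        super().__init__()
        self.prefix_len, self.d_model = prefix_len, d_model
        self.net = nn.Sequential(
            nn.Linear(audio_dim, prefix_len * d_model),
            nn.Tanh(),
            nn.Linear(prefix_len * d_model, prefix_len * d_model),
        )

    def forward(self, audio_emb):                      # (B, audio_dim)
        prefix = self.net(audio_emb)                   # (B, prefix_len * d_model)
        return prefix.view(-1, self.prefix_len, self.d_model)

def decoder_inputs(prefix, token_embeddings):
    """Concatenate the audio prefix with caption token embeddings along the
    sequence dimension; in the real system this goes into a GPT-2 decoder."""
    return torch.cat([prefix, token_embeddings], dim=1)
```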
Vilas, M. G., Schaumlöffel, T., & Roig, G. (2023). Analyzing Vision Transformers for Image Classification in Class Embedding Space. Advances in Neural Information Processing Systems, 36, 40030–40041. https://proceedings.neurips.cc/paper_files/paper/2023/hash/7dd309df03d37643b96f5048b44da798-Abstract-Conference.html
@article{vilasAnalyzingVisionTransformers2023a,
title = {Analyzing {{Vision Transformers}} for {{Image Classification}} in {{Class Embedding Space}}},
author = {Vilas, Martina G. and Schaumlöffel, Timothy and Roig, Gemma},
date = {2023-12-15},
journaltitle = {Advances in Neural Information Processing Systems},
volume = {36},
pages = {40030--40041},
url = {https://proceedings.neurips.cc/paper_files/paper/2023/hash/7dd309df03d37643b96f5048b44da798-Abstract-Conference.html},
urldate = {2024-07-30},
langid = {english},
}
Vilas, M. G., Adolfi, F., Poeppel, D., & Roig, G. (2024, June 6). Position: An Inner Interpretability Framework for AI Inspired by Lessons from Cognitive Neuroscience. Forty-First International Conference on Machine Learning. https://openreview.net/forum?id=66KmnMhGU5
@inproceedings{vilasPositionInnerInterpretability2024,
title = {Position: {{An Inner Interpretability Framework}} for {{AI Inspired}} by {{Lessons}} from {{Cognitive Neuroscience}}},
shorttitle = {Position},
booktitle = {Forty-First {{International Conference}} on {{Machine Learning}}},
author = {Vilas, Martina G. and Adolfi, Federico and Poeppel, David and Roig, Gemma},
date = {2024-06-06},
url = {https://openreview.net/forum?id=66KmnMhGU5},
urldate = {2024-07-30},
langid = {english},
}
Abstract
Inner Interpretability is a promising emerging field tasked with uncovering the inner mechanisms of AI systems, though how to develop these mechanistic theories is still much debated. Moreover, recent critiques raise issues that question its usefulness to advance the broader goals of AI. However, it has been overlooked that these issues resemble those that have been grappled with in another field: Cognitive Neuroscience. Here we draw the relevant connections and highlight lessons that can be transferred productively between fields. Based on these, we propose a general conceptual framework and give concrete methodological strategies for building mechanistic explanations in AI inner interpretability research. With this conceptual framework, Inner Interpretability can fend off critiques and position itself on a productive path to explain AI systems.
Background Publications
Bersch, D., Dwivedi, K., Vilas, M., Cichy, R. M., & Roig, G. (2022). Net2Brain: A Toolbox to Compare Artificial Vision Models with Human Brain Responses. https://doi.org/10.48550/arXiv.2208.09677
@online{berschNet2BrainToolboxCompare2022,
title = {{{Net2Brain}}: {{A Toolbox}} to Compare Artificial Vision Models with Human Brain Responses},
shorttitle = {{{Net2Brain}}},
author = {Bersch, Domenic and Dwivedi, Kshitij and Vilas, Martina and Cichy, Radoslaw M. and Roig, Gemma},
date = {2022-08-25},
eprint = {2208.09677},
eprinttype = {arXiv},
eprintclass = {cs, q-bio},
doi = {10.48550/arXiv.2208.09677},
url = {http://arxiv.org/abs/2208.09677},
urldate = {2024-07-01},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Quantitative Biology - Neurons and Cognition},
}
Abstract
We introduce Net2Brain, a graphical and command-line user interface toolbox for comparing the representational spaces of artificial deep neural networks (DNNs) and human brain recordings. While different toolboxes facilitate only single functionalities or only focus on a small subset of supervised image classification models, Net2Brain allows the extraction of activations of more than 600 DNNs trained to perform a diverse range of vision-related tasks (e.g., semantic segmentation, depth estimation, action recognition, etc.), over both image and video datasets. The toolbox computes the representational dissimilarity matrices (RDMs) over those activations and compares them to brain recordings using representational similarity analysis (RSA) and weighted RSA, both in specific ROIs and with searchlight analysis. In addition, it is possible to add a new data set of stimuli and brain recordings to the toolbox for evaluation. We demonstrate the functionality and advantages of Net2Brain with an example showcasing how it can be used to test hypotheses of cognitive computational neuroscience.
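The core computation the toolbox automates, building RDMs from activations and comparing them to brain RDMs with RSA, can be written generically as below. This is not the Net2Brain API, only an illustration of the underlying analysis with assumed correlation-distance RDMs and a Spearman comparison.

```python
# Generic RSA sketch (not the Net2Brain API): build representational
# dissimilarity matrices (RDMs) from activations and compare model and brain
# RDMs with Spearman's rank correlation over their upper triangles.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr

def rdm(activations):
    """activations: (n_stimuli, n_units) -> (n_stimuli, n_stimuli) RDM using
    correlation distance (1 - Pearson r) between stimulus patterns."""
    return squareform(pdist(activations, metric="correlation"))

def rsa_score(rdm_model, rdm_brain):
    """Spearman correlation between the upper triangles of the two RDMs."""
    iu = np.triu_indices_from(rdm_model, k=1)
    return spearmanr(rdm_model[iu], rdm_brain[iu]).correlation
```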
Dwivedi, K., Cichy, R. M., & Roig, G. (2021). Unraveling Representations in Scene-selective Brain Regions Using Scene-Parsing Deep Neural Networks. Journal of Cognitive Neuroscience, 33(10), 2032–2043. https://doi.org/10.1162/jocn_a_01624
@article{dwivediUnravelingRepresentationsSceneselective2021,
title = {Unraveling {{Representations}} in {{Scene-selective Brain Regions Using Scene-Parsing Deep Neural Networks}}},
author = {Dwivedi, Kshitij and Cichy, Radoslaw Martin and Roig, Gemma},
date = {2021-09-01},
journaltitle = {Journal of Cognitive Neuroscience},
shortjournal = {Journal of Cognitive Neuroscience},
volume = {33},
number = {10},
pages = {2032--2043},
issn = {0898-929X},
doi = {10.1162/jocn_a_01624},
url = {https://doi.org/10.1162/jocn_a_01624},
urldate = {2024-07-01},
}
Abstract
Visual scene perception is mediated by a set of cortical regions that respond preferentially to images of scenes, including the occipital place area (OPA) and parahippocampal place area (PPA). However, the differential contribution of OPA and PPA to scene perception remains an open research question. In this study, we take a deep neural network (DNN)-based computational approach to investigate the differences in OPA and PPA function. In a first step, we search for a computational model that predicts fMRI responses to scenes in OPA and PPA well. We find that DNNs trained to predict scene components (e.g., wall, ceiling, floor) explain higher variance uniquely in OPA and PPA than a DNN trained to predict scene category (e.g., bathroom, kitchen, office). This result is robust across several DNN architectures. On this basis, we then determine whether particular scene components predicted by DNNs differentially account for unique variance in OPA and PPA. We find that variance in OPA responses uniquely explained by the navigation-related floor component is higher compared to the variance explained by the wall and ceiling components. In contrast, PPA responses are better explained by the combination of wall and floor, that is, scene components that together contain the structure and texture of the scene. This differential sensitivity to scene components suggests differential functions of OPA and PPA in scene processing. Moreover, our results further highlight the potential of the proposed computational approach as a general tool in the investigation of the neural basis of human scene perception.
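The unique-variance logic used here can be stated compactly: the variance a feature set A explains uniquely is the cross-validated R² of the combined model (A and B together) minus the R² of the model with B alone. A minimal sketch follows, with ridge regularization and a single ROI response vector assumed; it is not the study's pipeline.

```python
# Variance-partitioning sketch (illustrative, not the study's pipeline): the
# variance a feature set A explains uniquely is the cross-validated R^2 of the
# combined model minus the R^2 of the model containing only B.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score

def cv_r2(X, y, alphas=(1.0, 10.0, 100.0)):
    """Cross-validated R^2 of a ridge model predicting an ROI response vector y."""
    return cross_val_score(RidgeCV(alphas=alphas), X, y, scoring="r2", cv=5).mean()

def unique_variance(X_a, X_b, y):
    """Unique variance of feature set X_a beyond X_b (e.g., a floor-component
    feature set vs. wall/ceiling features predicting OPA or PPA responses)."""
    return cv_r2(np.hstack([X_a, X_b]), y) - cv_r2(X_b, y)
```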
Dwivedi, K., Bonner, M. F., Cichy, R. M., & Roig, G. (2021). Unveiling Functions of the Visual Cortex Using Task-Specific Deep Neural Networks. PLOS Computational Biology, 17(8), e1009267. https://doi.org/10.1371/journal.pcbi.1009267
@article{dwivediUnveilingFunctionsVisual2021,
title = {Unveiling Functions of the Visual Cortex Using Task-Specific Deep Neural Networks},
author = {Dwivedi, Kshitij and Bonner, Michael F. and Cichy, Radoslaw Martin and Roig, Gemma},
date = {2021-08-13},
journaltitle = {PLOS Computational Biology},
shortjournal = {PLOS Computational Biology},
volume = {17},
number = {8},
pages = {e1009267},
publisher = {Public Library of Science},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1009267},
url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009267},
urldate = {2024-07-01},
langid = {english},
keywords = {Functional magnetic resonance imaging,Linear regression analysis,Neural networks,Permutation,Semantics,Sensory perception,Vision,Visual cortex},
}
Abstract
The human visual cortex enables visual perception through a cascade of hierarchical computations in cortical regions with distinct functionalities. Here, we introduce an AI-driven approach to discover the functional mapping of the visual cortex. We related human brain responses to scene images measured with functional MRI (fMRI) systematically to a diverse set of deep neural networks (DNNs) optimized to perform different scene perception tasks. We found a structured mapping between DNN tasks and brain regions along the ventral and dorsal visual streams. Low-level visual tasks mapped onto early brain regions, 3-dimensional scene perception tasks mapped onto the dorsal stream, and semantic tasks mapped onto the ventral stream. This mapping was of high fidelity, with more than 60% of the explainable variance in nine key regions being explained. Together, our results provide a novel functional mapping of the human visual cortex and demonstrate the power of the computational approach.
Nicholls, V. I., Krugliak, A., Alsbury-Nealy, B., Gramann, K., & Clarke, A. (2024). Congruency Effects on Object Recognition Persist When Objects Are Placed in the Wild: An AR and Mobile EEG Study. bioRxiv. https://doi.org/10.1101/2024.05.30.596613
@online{nichollsCongruencyEffectsObject2024,
title = {Congruency Effects on Object Recognition Persist When Objects Are Placed in the Wild: {{An AR}} and Mobile {{EEG}} Study},
shorttitle = {Congruency Effects on Object Recognition Persist When Objects Are Placed in the Wild},
author = {Nicholls, Victoria I. and Krugliak, Alexandra and Alsbury-Nealy, Benjamin and Gramann, Klaus and Clarke, Alex},
date = {2024-05-31},
eprinttype = {bioRxiv},
eprintclass = {New Results},
pages = {2024.05.30.596613},
doi = {10.1101/2024.05.30.596613},
url = {https://www.biorxiv.org/content/10.1101/2024.05.30.596613v1},
urldate = {2024-07-01},
langid = {english},
pubstate = {prepublished},
}
Abstract
Objects in expected locations are recognised faster and more accurately than objects in incongruent environments. This congruency effect has a neural component, with increased activity for objects in incongruent environments. Studies have increasingly shown differences between neural processes in realistic environments and tasks, and neural processes in the laboratory. To what extent do findings obtained from a laboratory setting translate to neural processes elicited in real-world environments? We investigated how object recognition is modulated when objects are placed in real environments using augmented reality while recording mobile EEG. Participants approached, viewed, and rated how congruent they found the objects with the environment. We found significantly higher theta-band power for objects in incongruent contexts than objects in congruent contexts. This demonstrates that real-world contexts impact how we recognize objects, and that mobile brain imaging and augmented reality are effective tools to study cognition in the wild. Teaser: Combining augmented reality with mobile brain imaging to show that real-world contexts modulate object recognition processes.
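The central contrast, theta-band power for incongruent versus congruent objects, can be sketched with Welch power spectra averaged over 4-7 Hz; the sampling rate, band limits, and simple averaging below are assumptions rather than the study's full time-frequency pipeline.

```python
# Illustrative sketch (not the study's full time-frequency pipeline): Welch
# power spectra per EEG epoch, averaged over the 4-7 Hz theta band and over
# channels, contrasted between incongruent and congruent object presentations.
import numpy as np
from scipy.signal import welch

def theta_power(epochs, sfreq, band=(4.0, 7.0)):
    """epochs: (n_epochs, n_channels, n_times) -> mean theta power per epoch."""
    freqs, psd = welch(epochs, fs=sfreq, nperseg=min(256, epochs.shape[-1]), axis=-1)
    mask = (freqs >= band[0]) & (freqs <= band[1])
    return psd[..., mask].mean(axis=(-1, -2))   # average over band and channels

def condition_means(incongruent, congruent, sfreq=500.0):
    """Mean theta power per condition; statistical inference (e.g., a paired
    test across participants) would operate on per-participant averages."""
    return theta_power(incongruent, sfreq).mean(), theta_power(congruent, sfreq).mean()
```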
Sassenhagen, J., & Fiebach, C. J. (2020). Traces of Meaning Itself: Encoding Distributional Word Vectors in Brain Activity. Neurobiology of Language, 1(1), 54–76. https://doi.org/10.1162/nol_a_00003
@article{sassenhagenTracesMeaningItself2020,
title = {Traces of {{Meaning Itself}}: {{Encoding Distributional Word Vectors}} in {{Brain Activity}}},
shorttitle = {Traces of {{Meaning Itself}}},
author = {Sassenhagen, Jona and Fiebach, Christian J.},
date = {2020-03-01},
journaltitle = {Neurobiology of Language},
shortjournal = {Neurobiology of Language},
volume = {1},
number = {1},
pages = {54--76},
issn = {2641-4368},
doi = {10.1162/nol_a_00003},
url = {https://doi.org/10.1162/nol_a_00003},
urldate = {2024-07-01},
}
Abstract
How is semantic information stored in the human mind and brain? Some philosophers and cognitive scientists argue for vectorial representations of concepts, where the meaning of a word is represented as its position in a high-dimensional neural state space. At the intersection of natural language processing and artificial intelligence, a class of very successful distributional word vector models has developed that can account for classic EEG findings of language, that is, the ease versus difficulty of integrating a word with its sentence context. However, models of semantics have to account not only for context-based word processing, but should also describe how word meaning is represented. Here, we investigate whether distributional vector representations of word meaning can model brain activity induced by words presented without context. Using EEG activity (event-related brain potentials) collected while participants in two experiments (English and German) read isolated words, we encoded and decoded word vectors taken from the family of prediction-based Word2vec algorithms. We found that, first, the position of a word in vector space allows the prediction of the pattern of corresponding neural activity over time, in particular during a time window of 300 to 500 ms after word onset. Second, distributional models perform better than a human-created taxonomic baseline model (WordNet), and this holds for several distinct vector-based models. Third, multiple latent semantic dimensions of word meaning can be decoded from brain activity. Combined, these results suggest that empiricist, prediction-based vectorial representations of meaning are a viable candidate for the representational architecture of human semantic knowledge.
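The encoding analysis can be sketched as a separate cross-validated ridge map from word vectors to the EEG topography at each time point, scored by the correlation between predicted and observed activity; the implementation below is illustrative, and its regularization and scoring choices are assumptions.

```python
# Time-resolved encoding sketch (illustrative; regularization and scoring
# choices are assumptions): predict the EEG topography at each time point from
# word vectors and score by cross-validated prediction-data correlation.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

def timewise_encoding(word_vecs, erps, alphas=(1.0, 10.0, 100.0)):
    """word_vecs: (n_words, d); erps: (n_words, n_channels, n_times).
    Returns the mean prediction-data correlation at each time point."""
    n_words, n_channels, n_times = erps.shape
    scores = np.zeros(n_times)
    for t in range(n_times):
        y = erps[:, :, t]
        preds = np.zeros_like(y)
        for train, test in KFold(5, shuffle=True, random_state=0).split(word_vecs):
            model = RidgeCV(alphas=alphas).fit(word_vecs[train], y[train])
            preds[test] = model.predict(word_vecs[test])
        zp = (preds - preds.mean(0)) / (preds.std(0) + 1e-12)
        zy = (y - y.mean(0)) / (y.std(0) + 1e-12)
        scores[t] = (zp * zy).mean()
    return scores
```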
Schwartz, D., Toneva, M., & Wehbe, L. (2019). Inducing Brain-Relevant Bias in Natural Language Processing Models. https://doi.org/10.48550/arXiv.1911.03268
@online{schwartzInducingBrainrelevantBias2019,
title = {Inducing Brain-Relevant Bias in Natural Language Processing Models},
author = {Schwartz, Dan and Toneva, Mariya and Wehbe, Leila},
date = {2019-10-29},
eprint = {1911.03268},
eprinttype = {arXiv},
eprintclass = {cs, q-bio},
doi = {10.48550/arXiv.1911.03268},
url = {http://arxiv.org/abs/1911.03268},
urldate = {2024-07-01},
pubstate = {prepublished},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Quantitative Biology - Neurons and Cognition},
}
Abstract
Progress in natural language processing (NLP) models that estimate representations of word sequences has recently been leveraged to improve the understanding of language processing in the brain. However, these models have not been specifically designed to capture the way the brain represents language meaning. We hypothesize that fine-tuning these models to predict recordings of brain activity of people reading text will lead to representations that encode more brain-activity-relevant language information. We demonstrate that a version of BERT, a recently introduced and powerful language model, can improve the prediction of brain activity after fine-tuning. We show that the relationship between language and brain activity learned by BERT during this fine-tuning transfers across multiple participants. We also show that, for some participants, the fine-tuned representations learned from both magnetoencephalography (MEG) and functional magnetic resonance imaging (fMRI) are better for predicting fMRI than the representations learned from fMRI alone, indicating that the learned representations capture brain-activity-relevant information that is not simply an artifact of the modality. While changes to language representations help the model predict brain activity, they also do not harm the model’s ability to perform downstream NLP tasks. Our findings are notable for research on language understanding in the brain.
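The fine-tuning idea can be sketched as a pretrained BERT encoder with a linear head that predicts voxel responses for a text window, trained end to end so the encoder's representations shift toward brain-relevant information. The model name, mean pooling, and regression loss in the sketch are assumptions, not the paper's exact setup.

```python
# Sketch of the fine-tuning idea (not the paper's code): a pretrained BERT
# encoder with a linear head predicting fMRI voxel responses for a text window,
# trained end to end so the encoder drifts toward brain-relevant representations.
# Model name, mean pooling, and the regression loss are assumptions.
import torch
import torch.nn as nn
from transformers import AutoModel

class BrainTunedBert(nn.Module):
    def __init__(self, n_voxels, model_name="bert-base-uncased"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.head = nn.Linear(self.encoder.config.hidden_size, n_voxels)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.last_hidden_state.mean(dim=1)     # mean-pool token states
        return self.head(pooled)

# Training would minimize a regression loss (e.g., nn.MSELoss()) between the
# predicted and measured voxel responses for each presented text window.
```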
Toneva, M., Mitchell, T. M., & Wehbe, L. (2022). Combining Computational Controls with Natural Text Reveals Aspects of Meaning Composition. Nature Computational Science, 2(11), 745–757. https://doi.org/10.1038/s43588-022-00354-6
@article{tonevaCombiningComputationalControls2022,
title = {Combining Computational Controls with Natural Text Reveals Aspects of Meaning Composition},
author = {Toneva, Mariya and Mitchell, Tom M. and Wehbe, Leila},
date = {2022-11},
journaltitle = {Nature Computational Science},
shortjournal = {Nat Comput Sci},
volume = {2},
number = {11},
pages = {745--757},
publisher = {Nature Publishing Group},
issn = {2662-8457},
doi = {10.1038/s43588-022-00354-6},
url = {https://www.nature.com/articles/s43588-022-00354-6},
urldate = {2024-07-01},
langid = {english},
keywords = {Computer science,Language,Neural encoding},
}
Abstract
To study a core component of human intelligence—our ability to combine the meaning of words—neuroscientists have looked to linguistics. However, linguistic theories are insufficient to account for all brain responses reflecting linguistic composition. In contrast, we adopt a data-driven approach to study the composed meaning of words beyond their individual meaning, which we term ‘supra-word meaning’. We construct a computational representation for supra-word meaning and study its brain basis through brain recordings from two complementary imaging modalities. Using functional magnetic resonance imaging, we reveal that hubs that are thought to process lexical meaning also maintain supra-word meaning, suggesting a common substrate for lexical and combinatorial semantics. Surprisingly, we cannot detect supra-word meaning in magnetoencephalography, which suggests that composed meaning might be maintained through a different neural mechanism than the synchronized firing of pyramidal cells. This sensitivity difference has implications for past neuroimaging results and future wearable neurotechnology.
Toneva, M., & Wehbe, L. (2019). Interpreting and Improving Natural-Language Processing (in Machines) with Natural Language-Processing (in the Brain). arXiv.org. https://arxiv.org/abs/1905.11833v4
@online{tonevaInterpretingImprovingNaturallanguage2019,
title = {Interpreting and Improving Natural-Language Processing (in Machines) with Natural Language-Processing (in the Brain)},
author = {Toneva, Mariya and Wehbe, Leila},
date = {2019-05-28},
url = {https://arxiv.org/abs/1905.11833v4},
urldate = {2024-07-01},
langid = {english},
organization = {arXiv.org},
}
Abstract
Neural network models for NLP are typically implemented without the explicit encoding of language rules and yet they are able to break one performance record after another. This has generated a lot of research interest in interpreting the representations learned by these networks. We propose here a novel interpretation approach that relies on the only processing system we have that does understand language: the human brain. We use brain imaging recordings of subjects reading complex natural text to interpret word and sequence embeddings from 4 recent NLP models - ELMo, USE, BERT and Transformer-XL. We study how their representations differ across layer depth, context length, and attention type. Our results reveal differences in the context-related representations across these models. Further, in the transformer models, we find an interaction between layer depth and context length, and between layer depth and attention type. We finally hypothesize that altering BERT to better align with brain recordings would enable it to also better understand language. Probing the altered BERT using syntactic NLP tasks reveals that the model with increased brain-alignment outperforms the original model. Cognitive neuroscientists have already begun using NLP networks to study the brain, and this work closes the loop to allow the interaction between NLP and cognitive neuroscience to be a true cross-pollination.