Human-like attention as a supervisory signal to guide neural attention has shown significant promise but is currently limited to uni-modal integration – even for inherently multi-modal tasks such as visual question answering (VQA). We present the Multimodal Human-like Attention Network (MULAN) – the first method for multimodal integration of human-like attention on image and text during training of VQA models. MULAN integrates attention predictions from two state-of-the-art text and image saliency models into neural self-attention layers of a recent transformer-based VQA model. Through evaluations on the challenging VQAv2 dataset, we show that MULAN achieves a new state-of-the-art performance of 73.98% accuracy on test-std and 73.72% on test-dev and, at the same time, has approximately 80% fewer trainable parameters than prior work. Overall, our work underlines the potential of integrating multimodal human-like and neural attention for VQA.
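The abstract describes injecting predicted human-like attention over text tokens and image regions into the self-attention layers of a transformer-based VQA model. Since only the abstract is available in this entry, the following is a minimal, hypothetical sketch of that general idea, not MULAN's actual formulation: the fusion rule (an additive log-space bias scaled by a learnable gate), the module name SaliencyGuidedSelfAttention, and all shapes are illustrative assumptions.

# Minimal, hypothetical sketch (PyTorch) of the general idea in the abstract:
# biasing a transformer self-attention layer with externally predicted
# human-like attention (saliency) over concatenated text and image tokens.
# NOT the actual MULAN architecture: the additive log-space bias, the
# learnable gate, and all names/shapes are assumptions for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F


class SaliencyGuidedSelfAttention(nn.Module):
    """Self-attention whose scores are biased by a human-like attention prior."""

    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, 3 * dim)
        self.proj = nn.Linear(dim, dim)
        # Learnable gate: starts at 0, so the prior is phased in during training.
        self.gate = nn.Parameter(torch.zeros(1))

    def forward(self, x: torch.Tensor, saliency: torch.Tensor) -> torch.Tensor:
        # x:        (batch, seq_len, dim)  -- text tokens + image region features
        # saliency: (batch, seq_len)       -- normalized human-like attention,
        #                                     e.g. from text/image saliency models
        b, n, d = x.shape
        qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)  # each: (b, heads, n, head_dim)

        scores = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5  # (b, heads, n, n)

        # Add the saliency prior as a log-space bias over the key dimension,
        # so tokens humans attend to receive more neural attention as well.
        prior = torch.log(saliency.clamp_min(1e-6))                # (b, n)
        scores = scores + self.gate * prior[:, None, None, :]

        attn = F.softmax(scores, dim=-1)
        out = (attn @ v).transpose(1, 2).reshape(b, n, d)
        return self.proj(out)


if __name__ == "__main__":
    layer = SaliencyGuidedSelfAttention(dim=64, num_heads=8)
    tokens = torch.randn(2, 10, 64)                   # dummy multimodal tokens
    saliency = F.softmax(torch.randn(2, 10), dim=-1)  # dummy saliency predictions
    print(layer(tokens, saliency).shape)              # torch.Size([2, 10, 64])

Initializing the gate at zero means the module starts out as standard self-attention and only adopts the human-attention prior to the extent that it helps the VQA objective; the paper's actual integration may differ.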
@techreport{sood21_arxiv,
author = {Sood, Ekta and Kögel, Fabian and Müller, Philipp and Thomas, Dominike and Bâce, Mihai and Bulling, Andreas},
keywords = {misc PN7 EXC2075 PN7-5},
month = {10},
note = {arXiv:2109.13139},
pages = {1--11},
title = {Multimodal Integration of Human-Like Attention in Visual Question Answering},
url = {https://arxiv.org/pdf/2109.13139.pdf},
year = 2021
}