Emotional expressions are inherently multimodal, integrating facial behavior, speech, and gaze, but their automatic recognition is often limited to a single modality, e.g., speech during a phone call. While previous work has proposed crossmodal emotion embeddings to improve monomodal recognition performance, these embeddings did not include a representation of gaze despite its importance. We propose a new approach to emotion recognition that incorporates an explicit representation of gaze into a crossmodal emotion embedding framework. We show that our method outperforms the previous state of the art for both audio-only and video-only emotion classification on the popular One-Minute Gradual Emotion Recognition dataset. Furthermore, we report extensive ablation experiments and provide insights into the performance of different state-of-the-art gaze representations and integration strategies. Our results not only underline the importance of gaze for emotion recognition but also demonstrate a practical and highly effective approach to leveraging gaze information for this task.
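The core idea of the abstract, encoding each modality (including gaze) into a shared emotion embedding space so that a single modality suffices at test time, can be illustrated with a minimal sketch. Everything below is an assumption made for illustration: the encoder architecture, feature dimensions, modality names, number of emotion classes, and the alignment loss are placeholders and do not reproduce the model from the paper; see the linked code repository for the actual implementation.

import torch
import torch.nn as nn


class ModalityEncoder(nn.Module):
    """Projects per-modality input features into a shared embedding space."""

    def __init__(self, in_dim: int, embed_dim: int = 128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.ReLU(),
            nn.Linear(256, embed_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


class GazeEnhancedCrossmodalModel(nn.Module):
    """Illustrative crossmodal embedding model with an explicit gaze encoder.

    All dimensions and modality names are hypothetical placeholders.
    """

    def __init__(self, audio_dim=40, video_dim=512, gaze_dim=6,
                 embed_dim=128, num_classes=7):
        super().__init__()
        self.encoders = nn.ModuleDict({
            "audio": ModalityEncoder(audio_dim, embed_dim),
            "video": ModalityEncoder(video_dim, embed_dim),
            "gaze": ModalityEncoder(gaze_dim, embed_dim),
        })
        # One classifier head shared across modalities, so any single
        # modality can be classified on its own at test time.
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, features):
        # Encode whichever modalities are present into the shared space,
        # then classify each embedding with the shared head.
        embeddings = {m: self.encoders[m](x) for m, x in features.items()}
        logits = {m: self.classifier(e) for m, e in embeddings.items()}
        return embeddings, logits


def alignment_loss(emb_a: torch.Tensor, emb_b: torch.Tensor) -> torch.Tensor:
    """Pulls embeddings of the same clip from two modalities together."""
    return nn.functional.mse_loss(emb_a, emb_b)


# Toy usage: during training all modalities are available; at test time a
# single modality (e.g., audio during a phone call) is enough.
model = GazeEnhancedCrossmodalModel()
batch = {
    "audio": torch.randn(8, 40),
    "video": torch.randn(8, 512),
    "gaze": torch.randn(8, 6),  # e.g., per-eye yaw/pitch plus blink features
}
embeddings, logits = model(batch)
labels = torch.randint(0, 7, (8,))
loss = (nn.functional.cross_entropy(logits["audio"], labels)
        + alignment_loss(embeddings["gaze"], embeddings["video"]))

The alignment term is one simple way to realize a crossmodal embedding objective: it encourages the gaze encoder to place a clip near its video embedding, so information from the richer modality can transfer to the sparser one during training.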
@inproceedings{abdou22_etra,
author = {Abdou, Ahmed and Sood, Ekta and Müller, Philipp and Bulling, Andreas},
booktitle = {Proc. International Symposium on Eye Tracking Research and Applications (ETRA)},
code = {https://git.hcics.simtech.uni-stuttgart.de/public-projects/gaze-enhanced-crossmodal-embeddings-for-emotion-recognition},
doi = {10.1145/3530879},
pages = {1--18},
title = {Gaze-enhanced Crossmodal Embeddings for Emotion Recognition},
volume = 6,
year = 2022
}