Non-verbal voice expressions (NVVEs) have been adopted
as a means of human-computer interaction in research studies.
However, exploring non-verbal voice-based interactions
has been constrained by the limited availability of suitable training
data and computational methods for classifying such expressions,
leading to a focus on simple binary inputs. We address
this issue with a new dataset containing 950 audio samples
comprising 6 classes of voice expressions. The data were
collected from 42 speakers who donated voice recordings. The
classifier was trained on the data using features derived from
mel-spectrograms. Furthermore, we studied the effectiveness
of data augmentation and improved over the baseline model accuracy
significantly with a test accuracy of 96.6% in a 5-fold
cross-validation. We have made CNVVE publicly accessible in
the hope that it will serve as a benchmark for future research.
%0 Conference Paper
%1 hedeshy2023cnvve
%A Hedeshy, Ramin
%A Menges, Raphael
%A Staab, Steffen
%B Proc. INTERSPEECH 2023
%D 2023
%K myown peer
%P 1553-1557
%R 10.21437/Interspeech.2023-201
%T CNVVE: Dataset and Benchmark for Classifying Non-verbal Voice
Expressions
%U https://www.isca-speech.org/archive/interspeech_2023/hedeshy23_interspeech.html
%X Non-verbal voice expressions (NVVEs) have been adopted
as a means of human-computer interaction in research studies.
However, exploring non-verbal voice-based interactions
has been constrained by the limited availability of suitable training
data and computational methods for classifying such expressions,
leading to a focus on simple binary inputs. We address
this issue with a new dataset containing 950 audio samples
comprising 6 classes of voice expressions. The data were
collected from 42 speakers who donated voice recordings. The
classifier was trained on the data using features derived from
mel-spectrograms. Furthermore, we studied the effectiveness
of data augmentation and improved over the baseline model accuracy
significantly with a test accuracy of 96.6% in a 5-fold
cross-validation. We have made CNVVE publicly accessible in
the hope that it will serve as a benchmark for future research.
@inproceedings{hedeshy2023cnvve,
  abstract    = {Non-verbal voice expressions (NVVEs) have been adopted
as a means of human-computer interaction in research studies.
However, exploring non-verbal voice-based interactions
has been constrained by the limited availability of suitable training
data and computational methods for classifying such expressions,
leading to a focus on simple binary inputs. We address
this issue with a new dataset containing 950 audio samples
comprising 6 classes of voice expressions. The data were
collected from 42 speakers who donated voice recordings. The
classifier was trained on the data using features derived from
mel-spectrograms. Furthermore, we studied the effectiveness
of data augmentation and improved over the baseline model accuracy
significantly with a test accuracy of 96.6% in a 5-fold
cross-validation. We have made CNVVE publicly accessible in
the hope that it will serve as a benchmark for future research.},
  added-at    = {2023-10-30T21:05:42.000+0100},
  author      = {Hedeshy, Ramin and Menges, Raphael and Staab, Steffen},
  biburl      = {https://puma.ub.uni-stuttgart.de/bibtex/292833d3d7c232c1f384a4721c1a6c4c3/ki},
  booktitle   = {Proc. {INTERSPEECH} 2023},
  doi         = {10.21437/Interspeech.2023-201},
  eventdate   = {August 20-24},
  eventtitle  = {Interspeech 2023},
  interhash   = {6adf2287678455a9fe702bdad6058b80},
  intrahash   = {92833d3d7c232c1f384a4721c1a6c4c3},
  keywords    = {myown peer},
  month       = aug,
  pages       = {1553--1557},
  timestamp   = {2023-10-30T21:24:25.000+0100},
  title       = {{CNVVE}: Dataset and Benchmark for Classifying Non-verbal Voice Expressions},
  url         = {https://www.isca-speech.org/archive/interspeech_2023/hedeshy23_interspeech.html},
  venue       = {Dublin, Ireland},
  year        = 2023
}