As social media constitutes a valuable source for data analysis for a wide range of applications, the need
for handling such data arises. However, the nonstandard language used on social media poses problems
for natural language processing (NLP) tools, as these are typically trained on standard language material.
We propose a text normalization approach to tackle this problem. More specifically, we investigate the
usefulness of a multimodular approach to account for the diversity of normalization issues encountered in
user-generated content (UGC). We consider three different types of UGC written in Dutch (SNS, SMS, and
tweets) and provide a detailed analysis of the performance of the different modules and the overall system.
We also apply an extrinsic evaluation by evaluating the performance of a part-of-speech tagger, lemmatizer,
and named-entity recognizer before and after normalization.
%0 Journal Article
%1 Schulz2016b
%A Schulz, Sarah
%A De Pauw, Guy
%A De Clercq, Orphée
%A Desmet, Bart
%A Hoste, Véronique
%A Daelemans, Walter
%A Macken, Lieve
%D 2016
%J ACM TIST
%K content modular normalization user-generate
%N 4
%P 61
%T Multimodular Text Normalization of Dutch User-Generated Content
%U http://dx.doi.org/10.1145/2850422
%V 7
%X As social media constitutes a valuable source for data analysis for a wide range of applications, the need
for handling such data arises. However, the nonstandard language used on social media poses problems
for natural language processing (NLP) tools, as these are typically trained on standard language material.
We propose a text normalization approach to tackle this problem. More specifically, we investigate the
usefulness of a multimodular approach to account for the diversity of normalization issues encountered in
user-generated content (UGC). We consider three different types of UGC written in Dutch (SNS, SMS, and
tweets) and provide a detailed analysis of the performance of the different modules and the overall system.
We also apply an extrinsic evaluation by evaluating the performance of a part-of-speech tagger, lemmatizer,
and named-entity recognizer before and after normalization.
% Journal article record for Schulz et al. (2016), ACM TIST 7(4).
% Bibliographic fields come first, followed by the abstract and the
% PUMA/BibSonomy bookkeeping fields (added-at, biburl, interhash, intrahash,
% timestamp), which standard styles ignore.
% NOTE(review): pages = 61 looks like the ACM article number rather than a
% page range -- confirm against the publisher record before citing.
@article{Schulz2016b,
  author    = {Schulz, Sarah and De Pauw, Guy and De Clercq, Orph{\'e}e and Desmet, Bart and Hoste, V{\'e}ronique and Daelemans, Walter and Macken, Lieve},
  title     = {Multimodular Text Normalization of {Dutch} User-Generated Content},
  journal   = {{ACM} Transactions on Intelligent Systems and Technology},
  volume    = {7},
  number    = {4},
  pages     = {61},
  month     = jul,
  year      = {2016},
  doi       = {10.1145/2850422},
  url       = {http://dx.doi.org/10.1145/2850422},
  keywords  = {content modular normalization user-generate},
  abstract  = {As social media constitutes a valuable source for data analysis for a wide range of applications, the need
for handling such data arises. However, the nonstandard language used on social media poses problems
for natural language processing (NLP) tools, as these are typically trained on standard language material.
We propose a text normalization approach to tackle this problem. More specifically, we investigate the
usefulness of a multimodular approach to account for the diversity of normalization issues encountered in
user-generated content (UGC). We consider three different types of UGC written in Dutch (SNS, SMS, and
tweets) and provide a detailed analysis of the performance of the different modules and the overall system.
We also apply an extrinsic evaluation by evaluating the performance of a part-of-speech tagger, lemmatizer,
and named-entity recognizer before and after normalization.},
  added-at  = {2016-09-21T15:18:25.000+0200},
  biburl    = {https://puma.ub.uni-stuttgart.de/bibtex/2b5c8e90eb9dff3fc51414c4ad406eb2a/sarahschulz},
  interhash = {825bae0683727e2fb59c3bde118b30ec},
  intrahash = {b5c8e90eb9dff3fc51414c4ad406eb2a},
  timestamp = {2016-09-21T13:58:00.000+0200},
}