In the recent years JSON affirmed as a very popular data format for representing massive data collections. JSON data collections are usually schemaless. While this ensures several advantages, the absence of schema information has important negative consequences: the correctness of complex queries and programs cannot be statically checked, users cannot rely on schema information to quickly figure out the structural properties that could speed up the formulation of correct queries, and many schema-based optimizations are not possible. In this paper we deal with the problem of inferring a schema from massive JSON datasets. We first identify a JSON type language which is simple and, at the same time, expressive enough to capture irregularities and to give complete structural information about input data. We then present our main contribution, which is the design of a schema inference algorithm, its theoretical study, and its implementation based on Spark, enabling reasonable schema inference time for massive collections. Finally, we report about an experimental analysis showing the effectiveness of our approach in terms of execution time, precision, and conciseness of inferred schemas, and scalability.
%0 Conference Paper
%1 baazizi2017schema
%A Baazizi, Mohamed Amine
%A Ben Lahmar, Houssem
%A Colazzo, Dario
%A Ghelli, Giorgio
%A Sartiani, Carlo
%B Proceedings of the Conference on Extending Database Technology (EDBT)
%D 2017
%K 2017 D03 from:leonkokkoliadis sfbtrr161 visus
%P 222-233
%R 10.5441/002/edbt.2017.21
%T Schema Inference for Massive JSON Datasets
%U http://dx.doi.org/10.5441/002/edbt.2017.21
%X In the recent years JSON affirmed as a very popular data format for representing massive data collections. JSON data collections are usually schemaless. While this ensures several advantages, the absence of schema information has important negative consequences: the correctness of complex queries and programs cannot be statically checked, users cannot rely on schema information to quickly figure out the structural properties that could speed up the formulation of correct queries, and many schema-based optimizations are not possible. In this paper we deal with the problem of inferring a schema from massive JSON datasets. We first identify a JSON type language which is simple and, at the same time, expressive enough to capture irregularities and to give complete structural information about input data. We then present our main contribution, which is the design of a schema inference algorithm, its theoretical study, and its implementation based on Spark, enabling reasonable schema inference time for massive collections. Finally, we report about an experimental analysis showing the effectiveness of our approach in terms of execution time, precision, and conciseness of inferred schemas, and scalability.
@inproceedings{baazizi2017schema,
  author    = {Baazizi, Mohamed Amine and Ben Lahmar, Houssem and Colazzo, Dario and Ghelli, Giorgio and Sartiani, Carlo},
  title     = {Schema Inference for Massive {JSON} Datasets},
  booktitle = {Proceedings of the Conference on Extending Database Technology ({EDBT})},
  pages     = {222--233},
  year      = {2017},
  doi       = {10.5441/002/edbt.2017.21},
  url       = {https://doi.org/10.5441/002/edbt.2017.21},
  abstract  = {In the recent years JSON affirmed as a very popular data format for representing massive data collections. JSON data collections are usually schemaless. While this ensures several advantages, the absence of schema information has important negative consequences: the correctness of complex queries and programs cannot be statically checked, users cannot rely on schema information to quickly figure out the structural properties that could speed up the formulation of correct queries, and many schema-based optimizations are not possible. In this paper we deal with the problem of inferring a schema from massive JSON datasets. We first identify a JSON type language which is simple and, at the same time, expressive enough to capture irregularities and to give complete structural information about input data. We then present our main contribution, which is the design of a schema inference algorithm, its theoretical study, and its implementation based on Spark, enabling reasonable schema inference time for massive collections. Finally, we report about an experimental analysis showing the effectiveness of our approach in terms of execution time, precision, and conciseness of inferred schemas, and scalability.},
  keywords  = {2017 D03 from:leonkokkoliadis sfbtrr161 visus},
  added-at  = {2020-03-06T15:45:06.000+0100},
  timestamp = {2020-10-05T11:32:53.000+0200},
  biburl    = {https://puma.ub.uni-stuttgart.de/bibtex/27ec5e901ce8b2960f350f4b09571e2d0/sfbtrr161},
  interhash = {b763d537c697c140d41af837b86da5b6},
  intrahash = {7ec5e901ce8b2960f350f4b09571e2d0},
}