Data lakes are on the rise as data platforms for
any kind of analytics, from data exploration to machine
learning. They achieve the required flexibility by storing
heterogeneous data in their raw format, and by avoiding the
need for pre-defined use cases. However, storing only raw data
is inefficient, as for many applications, the same data processing
has to be applied repeatedly. To foster the reuse of processing
steps, literature proposes to store data in different degrees of
processing in addition to their raw format. To this end, data
lakes are typically structured in zones. There exist various zone
models, but they are varied, vague, and no assessments are
given. It is unclear which of these zone models is applicable in a
practical data lake implementation in enterprises. In this work,
we assess existing zone models using requirements derived from
multiple representative data analytics use cases of a real-world
industry case. We identify the shortcomings of existing work
and develop a zone reference model for enterprise-grade data
lake management in a detailed manner. We assess the reference
model’s applicability through a prototypical implementation for
a real-world enterprise data lake use case. This assessment
shows that the zone reference model meets the requirements
relevant in practice and is ready for industry use.
%0 Generic
%1 giebler2020reference
%A Giebler, Corinna
%A Gröger, Christoph
%A Hoos, Eva
%A Schwarz, Holger
%A Mitschang, Bernhard
%B Proceedings of the 24th IEEE Enterprise Computing Conference (EDOC 2020)
%D 2020
%K data_lake industry_case industry_experience myown reference_model zones
%T A Zone Reference Model for Enterprise-Grade Data Lake Management
%U https://opencms.uni-stuttgart.de/fak5/ipvs/departments/as/publications/giebleca/20_zoneReferenceModel_EDOC_Preprint.pdf
%X Data lakes are on the rise as data platforms for
any kind of analytics, from data exploration to machine
learning. They achieve the required flexibility by storing
heterogeneous data in their raw format, and by avoiding the
need for pre-defined use cases. However, storing only raw data
is inefficient, as for many applications, the same data processing
has to be applied repeatedly. To foster the reuse of processing
steps, literature proposes to store data in different degrees of
processing in addition to their raw format. To this end, data
lakes are typically structured in zones. There exist various zone
models, but they are varied, vague, and no assessments are
given. It is unclear which of these zone models is applicable in a
practical data lake implementation in enterprises. In this work,
we assess existing zone models using requirements derived from
multiple representative data analytics use cases of a real-world
industry case. We identify the shortcomings of existing work
and develop a zone reference model for enterprise-grade data
lake management in a detailed manner. We assess the reference
model’s applicability through a prototypical implementation for
a real-world enterprise data lake use case. This assessment
shows that the zone reference model meets the requirements
relevant in practice and is ready for industry use.
% Conference paper by Giebler et al. (EDOC 2020) on a zone reference model for data lakes.
% Uses the canonical inproceedings entry type. The fields from added-at onward look like
% export bookkeeping (timestamps, hashes, source URL); standard styles ignore unknown fields.
@inproceedings{giebler2020reference,
  author    = {Giebler, Corinna and Gr{\"o}ger, Christoph and Hoos, Eva and Schwarz, Holger and Mitschang, Bernhard},
  title     = {A Zone Reference Model for Enterprise-Grade Data Lake Management},
  booktitle = {Proceedings of the 24th {IEEE} Enterprise Computing Conference ({EDOC} 2020)},
  year      = {2020},
  url       = {https://opencms.uni-stuttgart.de/fak5/ipvs/departments/as/publications/giebleca/20_zoneReferenceModel_EDOC_Preprint.pdf},
  keywords  = {data_lake industry_case industry_experience myown reference_model zones},
  abstract  = {Data lakes are on the rise as data platforms for
    any kind of analytics, from data exploration to machine
    learning. They achieve the required flexibility by storing
    heterogeneous data in their raw format, and by avoiding the
    need for pre-defined use cases. However, storing only raw data
    is inefficient, as for many applications, the same data processing
    has to be applied repeatedly. To foster the reuse of processing
    steps, literature proposes to store data in different degrees of
    processing in addition to their raw format. To this end, data
    lakes are typically structured in zones. There exist various zone
    models, but they are varied, vague, and no assessments are
    given. It is unclear which of these zone models is applicable in a
    practical data lake implementation in enterprises. In this work,
    we assess existing zone models using requirements derived from
    multiple representative data analytics use cases of a real-world
    industry case. We identify the shortcomings of existing work
    and develop a zone reference model for enterprise-grade data
    lake management in a detailed manner. We assess the reference
    model’s applicability through a prototypical implementation for
    a real-world enterprise data lake use case. This assessment
    shows that the zone reference model meets the requirements
    relevant in practice and is ready for industry use.},
  added-at  = {2020-09-23T15:13:47.000+0200},
  timestamp = {2021-01-18T12:48:01.000+0100},
  biburl    = {https://puma.ub.uni-stuttgart.de/bibtex/25512d3d31f61060445c79210c82be49c/corinnagiebler},
  interhash = {cfbd27fea23ee66222b888ce85047f6f},
  intrahash = {5512d3d31f61060445c79210c82be49c},
}