We investigate the effect of hard faults on a massively-parallel implementation
of the Sparse Grid Combination Technique (SGCT), an efficient numerical
approach for the solution of high-dimensional time-dependent PDEs. The SGCT
allows us to increase the spatial resolution of a solver to a level that is out
of scope with classical discretization schemes due to the curse of
dimensionality. We exploit the inherent data redundancy of this algorithm to
obtain a scalable and fault-tolerant implementation without the need of
checkpointing or process replication. It is a lossy approach that can guarantee
convergence for a large number of faults and a wide range of applications. We
present first results using our fault simulation framework – and the first
convergence and scalability results with simulated faults and algorithm-based
fault tolerance for PDEs in more than three dimensions.
%0 Conference Paper
%1 heene2017massivelyparallel
%A Heene, Mario
%A Hinojosa, Alfredo Parra
%A Bungartz, Hans-Joachim
%A Pflüger, Dirk
%B Euro-Par 2016: Parallel Processing Workshops
%C Cham
%D 2017
%E Desprez, F.
%E et al.
%I Springer
%K imported from:leiterrl
%P 635--647
%R 10.1007/978-3-319-58943-5_51
%T A Massively-Parallel, Fault-Tolerant Solver for High-Dimensional PDEs
%U http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2017-31&engl=0
%V 10104
%X We investigate the effect of hard faults on a massively-parallel implementation
of the Sparse Grid Combination Technique (SGCT), an efficient numerical
approach for the solution of high-dimensional time-dependent PDEs. The SGCT
allows us to increase the spatial resolution of a solver to a level that is out
of scope with classical discretization schemes due to the curse of
dimensionality. We exploit the inherent data redundancy of this algorithm to
obtain a scalable and fault-tolerant implementation without the need of
checkpointing or process replication. It is a lossy approach that can guarantee
convergence for a large number of faults and a wide range of applications. We
present first results using our fault simulation framework – and the first
convergence and scalability results with simulated faults and algorithm-based
fault tolerance for PDEs in more than three dimensions.
@comment{heene2017massivelyparallel: workshop paper in Euro-Par 2016: Parallel Processing Workshops (LNCS volume 10104). Cleaned by hand: the mis-encoded dash in the abstract is written as --, the truncated editor list uses the "and others" token, month uses the standard macro, and only the acronym in the title is brace-protected. All other fields are kept as exported.}
@inproceedings{heene2017massivelyparallel,
abstract = {We investigate the effect of hard faults on a massively-parallel implementation
of the Sparse Grid Combination Technique (SGCT), an efficient numerical
approach for the solution of high-dimensional time-dependent PDEs. The SGCT
allows us to increase the spatial resolution of a solver to a level that is out
of scope with classical discretization schemes due to the curse of
dimensionality. We exploit the inherent data redundancy of this algorithm to
obtain a scalable and fault-tolerant implementation without the need of
checkpointing or process replication. It is a lossy approach that can guarantee
convergence for a large number of faults and a wide range of applications. We
present first results using our fault simulation framework -- and the first
convergence and scalability results with simulated faults and algorithm-based
fault tolerance for PDEs in more than three dimensions.},
added-at = {2020-07-27T15:42:33.000+0200},
address = {Cham},
author = {Heene, Mario and Hinojosa, Alfredo Parra and Bungartz, Hans-Joachim and Pfl{\"u}ger, Dirk},
biburl = {https://puma.ub.uni-stuttgart.de/bibtex/29680229a91f3878c18457f74bab04078/ipvs-sc},
booktitle = {Euro-Par 2016: Parallel Processing Workshops},
cr-category = {G.4 Mathematical Software},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Simulation gro{\ss}er Systeme},
doi = {10.1007/978-3-319-58943-5_51},
editor = {Desprez, F. and others},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
interhash = {5e542e3140d7407ec6fb732f6fa57ce8},
intrahash = {9680229a91f3878c18457f74bab04078},
keywords = {imported from:leiterrl},
language = {Englisch},
month = may,
pages = {635--647},
publisher = {Springer},
series = {Lecture Notes in Computer Science (LNCS)},
timestamp = {2020-07-27T13:42:33.000+0200},
title = {A Massively-Parallel, Fault-Tolerant Solver for High-Dimensional {PDEs}},
type = {Konferenz-Beitrag},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2017-31&engl=0},
volume = 10104,
year = 2017
}