Modern many-core architectures with hundreds of cores provide a high computational potential. This makes them particularly interesting for scientific high-performance computing and simulation technology. Like all nano scaled semiconductor devices, many-core processors are prone to reliability harming factors like variations and soft errors. One way to improve the reliability of such systems is software-based hardware fault tolerance. Here, the software is able to detect and correct errors introduced by the hardware. In this work, we propose a software-based approach to improve the reliability of matrix operations on many-core processors. These operations are key components in many scientific applications.
%0 Conference Paper
%1 BraunW2010
%A Braun, Claus
%A Wunderlich, Hans-Joachim
%B Proceedings of the 15th IEEE European Test Symposium (ETS'10)
%D 2010
%I IEEE Computer Society
%K ABFT GPGPU GPU SimTech fault-tolerance imported myown
%P 253
%R 10.1109/ETSYM.2010.5512738
%T Algorithm-Based Fault Tolerance for Many-Core Architectures
%X Modern many-core architectures with hundreds of cores provide a high computational potential. This makes them particularly interesting for scientific high-performance computing and simulation technology. Like all nano scaled semiconductor devices, many-core processors are prone to reliability harming factors like variations and soft errors. One way to improve the reliability of such systems is software-based hardware fault tolerance. Here, the software is able to detect and correct errors introduced by the hardware. In this work, we propose a software-based approach to improve the reliability of matrix operations on many-core processors. These operations are key components in many scientific applications.
@inproceedings{BraunW2010,
  author    = {Braun, Claus and Wunderlich, Hans-Joachim},
  title     = {Algorithm-Based Fault Tolerance for Many-Core Architectures},
  booktitle = {Proceedings of the 15th IEEE European Test Symposium (ETS'10)},
  publisher = {IEEE Computer Society},
  year      = {2010},
  pages     = {253},
  doi       = {10.1109/ETSYM.2010.5512738},
  file      = {http://www.iti.uni-stuttgart.de//fileadmin/rami/files/publications/2010/ETS_BraunW2010.pdf},
  abstract  = {Modern many-core architectures with hundreds of cores provide a high computational potential. This makes them particularly interesting for scientific high-performance computing and simulation technology. Like all nano scaled semiconductor devices, many-core processors are prone to reliability harming factors like variations and soft errors. One way to improve the reliability of such systems is software-based hardware fault tolerance. Here, the software is able to detect and correct errors introduced by the hardware. In this work, we propose a software-based approach to improve the reliability of matrix operations on many-core processors. These operations are key components in many scientific applications.},
  keywords  = {ABFT GPGPU GPU SimTech fault-tolerance imported myown},
  added-at  = {2018-03-19T16:15:07.000+0100},
  timestamp = {2018-03-19T15:19:34.000+0100},
  biburl    = {https://puma.ub.uni-stuttgart.de/bibtex/28ae495f897e6e1e00d2159bae7ffc325/clausbraun},
  interhash = {2c870c5be652c307dc565990657ae91c},
  intrahash = {8ae495f897e6e1e00d2159bae7ffc325},
}