
{  
   "types" : {
      "Bookmark" : {
         "pluralLabel" : "Bookmarks"
      },
      "Publication" : {
         "pluralLabel" : "Publications"
      },
      "GoldStandardPublication" : {
         "pluralLabel" : "GoldStandardPublications"
      },
      "GoldStandardBookmark" : {
         "pluralLabel" : "GoldStandardBookmarks"
      },
      "Tag" : {
         "pluralLabel" : "Tags"
      },
      "User" : {
         "pluralLabel" : "Users"
      },
      "Group" : {
         "pluralLabel" : "Groups"
      },
      "Sphere" : {
         "pluralLabel" : "Spheres"
      }
   },
   
   "properties" : {
      "count" : {
         "valueType" : "number"
      },
      "date" : {
         "valueType" : "date"
      },
      "changeDate" : {
         "valueType" : "date"
      },
      "url" : {
         "valueType" : "url"
      },
      "id" : {
         "valueType" : "url"
      },
      "tags" : {
         "valueType" : "item"
      },
      "user" : {
         "valueType" : "item"
      }      
   },
   
   "items" : [
   	  
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2e8ebd2f42b453167d2065725486828cb/aisa",         
         "tags" : [
            "myown","Performance","CPU","Evaluation","SVM","SYCL","GPU","AISA","exc2075"
         ],
         
         "intraHash" : "e8ebd2f42b453167d2065725486828cb",
         "interHash" : "bfbc52cd98d241445f5051b284bf6ded",
         "label" : "Evaluation of SYCL\u2019s Different Data Parallel Kernels",
         "user" : "aisa",
         "description" : "",
         "date" : "2025-06-23 09:45:25",
         "changeDate" : "2025-06-23 09:45:25",
         "count" : 10,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 12th International Workshop on OpenCL and SYCL","series": "IWOCL '24","publisher":"Association for Computing Machinery","address":"New York, NY, USA",
         "year": "2024", 
         "url": "https://doi.org/10.1145/3648115.3648130", 
         
         "author": [ 
            "Marcel Breyer","Alexander Van Craen","Dirk Pflüger"
         ],
         "authors": [
         	
            	{"first" : "Marcel",	"last" : "Breyer"},
            	{"first" : "Alexander",	"last" : "Van Craen"},
            	{"first" : "Dirk",	"last" : "Pflüger"}
         ],
         "pages": "1-4","abstract": "SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.",
         "isbn" : "9798400717901",
         
         "language" : "english",
         
         "numpages" : "4",
         
         "articleno" : "10",
         
         "location" : "Chicago, IL, USA",
         
         "doi" : "10.1145/3648115.3648130",
         
         "bibtexKey": "breyer2024evaluation"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2e8ebd2f42b453167d2065725486828cb/vancraen",         
         "tags" : [
            "AISA","CPU","Evaluation","GPU","Performance","SVM","SYCL","exc2075","myown"
         ],
         
         "intraHash" : "e8ebd2f42b453167d2065725486828cb",
         "interHash" : "bfbc52cd98d241445f5051b284bf6ded",
         "label" : "Evaluation of SYCL\u2019s Different Data Parallel Kernels",
         "user" : "vancraen",
         "description" : "",
         "date" : "2024-09-30 13:17:58",
         "changeDate" : "2025-06-23 09:45:25",
         "count" : 10,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 12th International Workshop on OpenCL and SYCL","series": "IWOCL '24","publisher":"Association for Computing Machinery","address":"New York, NY, USA",
         "year": "2024", 
         "url": "https://doi.org/10.1145/3648115.3648130", 
         
         "author": [ 
            "Marcel Breyer","Alexander Van Craen","Dirk Pflüger"
         ],
         "authors": [
         	
            	{"first" : "Marcel",	"last" : "Breyer"},
            	{"first" : "Alexander",	"last" : "Van Craen"},
            	{"first" : "Dirk",	"last" : "Pflüger"}
         ],
         "pages": "1-4","abstract": "SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.",
         "isbn" : "9798400717901",
         
         "language" : "english",
         
         "numpages" : "4",
         
         "articleno" : "10",
         
         "location" : "Chicago, IL, USA",
         
         "doi" : "10.1145/3648115.3648130",
         
         "bibtexKey": "breyer2024evaluation"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2e8ebd2f42b453167d2065725486828cb/ipvs-sgs",         
         "tags" : [
            "myown","Performance","CPU","Evaluation","SVM","SYCL","GPU","AISA","exc2075","aisa"
         ],
         
         "intraHash" : "e8ebd2f42b453167d2065725486828cb",
         "interHash" : "bfbc52cd98d241445f5051b284bf6ded",
         "label" : "Evaluation of SYCL\u2019s Different Data Parallel Kernels",
         "user" : "ipvs-sgs",
         "description" : "",
         "date" : "2024-09-30 13:17:58",
         "changeDate" : "2025-06-23 09:45:25",
         "count" : 10,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 12th International Workshop on OpenCL and SYCL","series": "IWOCL '24","publisher":"Association for Computing Machinery","address":"New York, NY, USA",
         "year": "2024", 
         "url": "https://doi.org/10.1145/3648115.3648130", 
         
         "author": [ 
            "Marcel Breyer","Alexander Van Craen","Dirk Pflüger"
         ],
         "authors": [
         	
            	{"first" : "Marcel",	"last" : "Breyer"},
            	{"first" : "Alexander",	"last" : "Van Craen"},
            	{"first" : "Dirk",	"last" : "Pflüger"}
         ],
         "pages": "1-4","abstract": "SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.",
         "isbn" : "9798400717901",
         
         "language" : "english",
         
         "numpages" : "4",
         
         "articleno" : "10",
         
         "location" : "Chicago, IL, USA",
         
         "doi" : "10.1145/3648115.3648130",
         
         "bibtexKey": "breyer2024evaluation"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2e8ebd2f42b453167d2065725486828cb/ipvs-sc",         
         "tags" : [
            "myown","Performance","CPU","Evaluation","SVM","SYCL","GPU","AISA","exc2075","aisa"
         ],
         
         "intraHash" : "e8ebd2f42b453167d2065725486828cb",
         "interHash" : "bfbc52cd98d241445f5051b284bf6ded",
         "label" : "Evaluation of SYCL\u2019s Different Data Parallel Kernels",
         "user" : "ipvs-sc",
         "description" : "",
         "date" : "2024-09-30 13:17:58",
         "changeDate" : "2025-06-23 09:45:25",
         "count" : 10,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 12th International Workshop on OpenCL and SYCL","series": "IWOCL '24","publisher":"Association for Computing Machinery","address":"New York, NY, USA",
         "year": "2024", 
         "url": "https://doi.org/10.1145/3648115.3648130", 
         
         "author": [ 
            "Marcel Breyer","Alexander Van Craen","Dirk Pflüger"
         ],
         "authors": [
         	
            	{"first" : "Marcel",	"last" : "Breyer"},
            	{"first" : "Alexander",	"last" : "Van Craen"},
            	{"first" : "Dirk",	"last" : "Pflüger"}
         ],
         "pages": "1-4","abstract": "SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.",
         "isbn" : "9798400717901",
         
         "language" : "english",
         
         "numpages" : "4",
         
         "articleno" : "10",
         
         "location" : "Chicago, IL, USA",
         
         "doi" : "10.1145/3648115.3648130",
         
         "bibtexKey": "breyer2024evaluation"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/283b888adc4b029227d9b7e97896f3ec0/isw-bibliothek",         
         "tags" : [
            "GPU","Industrial","Manufacturing","PLC","Real-time","Reinforcement","control","isw","learning","myown"
         ],
         
         "intraHash" : "83b888adc4b029227d9b7e97896f3ec0",
         "interHash" : "a39b6465c0a777b3cbd24ec6b0425355",
         "label" : "Reinforcement learning methods based on GPU accelerated industrial control hardware",
         "user" : "isw-bibliothek",
         "description" : "",
         "date" : "2021-03-19 15:13:44",
         "changeDate" : "2022-01-13 08:22:37",
         "count" : 2,
         "pub-type": "article",
         "journal": "Neural Computing and Applications",
         "year": "2021", 
         "url": "https://link.springer.com/article/10.1007/s00521-021-05848-4", 
         
         "author": [ 
            "Alexander Schmidt","Florian Schellroth","Marc Fischer","Lukas Allimant","Oliver Riedel"
         ],
         "authors": [
         	
            	{"first" : "Alexander",	"last" : "Schmidt"},
            	{"first" : "Florian",	"last" : "Schellroth"},
            	{"first" : "Marc",	"last" : "Fischer"},
            	{"first" : "Lukas",	"last" : "Allimant"},
            	{"first" : "Oliver",	"last" : "Riedel"}
         ],
         
         "doi" : "https://doi.org/10.1007/s00521-021-05848-4",
         
         "bibtexKey": "schmidt2021reinforcement"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/278feed56c1636b8fcbfd657450c145bd/clausbraun",         
         "tags" : [
            "ABFT","GPGPU","GPU","SimTech","algebra","algorithm-based","error","error-detection","fault","fault-tolerance","linear","matrix-operations","myown","simulation"
         ],
         
         "intraHash" : "78feed56c1636b8fcbfd657450c145bd",
         "interHash" : "f4aa6bff08e99d1685a2218270cadc80",
         "label" : "Algorithm-based fault tolerance for matrix operations on graphics processing units: analysis and extension to autonomous operation.",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:42:05",
         "changeDate" : "2018-03-19 15:42:05",
         "count" : 3,
         "pub-type": "phdthesis",
         
         "year": "2015", 
         "url": "", 
         
         "author": [ 
            "Claus Braun"
         ],
         "authors": [
         	
            	{"first" : "Claus",	"last" : "Braun"}
         ],
         
         "ee" : "http://d-nb.info/1075190916",
         
         "bibtexKey": "phd/dnb/Braun15"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2a6dcd392b900956871dd3cfde89cd481/clausbraun",         
         "tags" : [
            "ABFT","GPGPU","GPU","SimTech","adaptivity","algebra","algorithm-based","autonompous","error","error-correction","error-detection","fault-tolerance","linear","matrix","matrix-multiplication","metric","myown","rounding","rounding-error"
         ],
         
         "intraHash" : "a6dcd392b900956871dd3cfde89cd481",
         "interHash" : "f672f35af5f6b825bda005ca703be294",
         "label" : "A-ABFT: Autonomous Algorithm-Based Fault Tolerance for Matrix Multiplications on Graphics Processing Units",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:15:07",
         "changeDate" : "2018-03-19 15:37:08",
         "count" : 7,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 44th Annual IEEE/IFIP International Conference on Dependable Systems and Networks (DSN'14)",
         "year": "2014", 
         "url": "", 
         
         "author": [ 
            "Claus Braun","Sebastian Halder","Hans-Joachim Wunderlich"
         ],
         "authors": [
         	
            	{"first" : "Claus",	"last" : "Braun"},
            	{"first" : "Sebastian",	"last" : "Halder"},
            	{"first" : "Hans-Joachim",	"last" : "Wunderlich"}
         ],
         "pages": "443--454","abstract": "Graphics processing units (GPUs) enable large-scale scientific applications and simulations on the desktop. To allow scientific computing on GPUs with high performance and reliability requirements, the application of software-based fault tolerance is attractive. Algorithm-Based Fault Tolerance (ABFT) protects important scientific operations like matrix multiplications. However, the application to floating-point operations necessitates the runtime classification of errors into inevitable rounding errors, allowed compute errors in the magnitude of such rounding errors, and into critical errors that are larger than those and not tolerable. Hence, an ABFT scheme needs suitable rounding error bounds to detect errors reliably. The determination of such error bounds is a highly challenging task, especially since it has to be integrated tightly into the algorithm and executed autonomously with low performance overhead.\r\n In this work, A-ABFT for matrix multiplications on GPUs is introduced, which is a new, parallel ABFT scheme that determines rounding error bounds autonomously at runtime with low performance overhead and high error coverage.",
         "file" : "http://www.iti.uni-stuttgart.de/fileadmin/rami/files/publications/2014/DSN_BraunHH2014.pdf",
         
         "doi" : "http://dx.doi.org/10.1109/DSN.2014.48",
         
         "bibtexKey": "BraunHW2014"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2126f35b3dc5e36c0d63a461eb07e23c3/clausbraun",         
         "tags" : [
            "GPGPU","GPU","Markov-Chain","Monte-Carlo","SimTech","architectures","computer","heterogeneous","hybrid","molecular","myown","parallel","simulation","thermodynamics"
         ],
         
         "intraHash" : "126f35b3dc5e36c0d63a461eb07e23c3",
         "interHash" : "8b3986f798c6d3bc3d644b3a4e79b147",
         "label" : "Acceleration of Monte-Carlo Molecular Simulations on Hybrid Computing Architectures",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:15:07",
         "changeDate" : "2018-03-19 15:32:04",
         "count" : 7,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 30th IEEE International Conference on Computer Design (ICCD'12)","publisher":"IEEE Computer Society",
         "year": "2012", 
         "url": "", 
         
         "author": [ 
            "Claus Braun","Stefan Holst","Hans-Joachim Wunderlich","Juan Manuel Castillo","Joachim Gross"
         ],
         "authors": [
         	
            	{"first" : "Claus",	"last" : "Braun"},
            	{"first" : "Stefan",	"last" : "Holst"},
            	{"first" : "Hans-Joachim",	"last" : "Wunderlich"},
            	{"first" : "Juan Manuel",	"last" : "Castillo"},
            	{"first" : "Joachim",	"last" : "Gross"}
         ],
         "pages": "207--212","abstract": "Markov-Chain Monte-Carlo (MCMC) methods are an important class of simulation techniques, which execute a sequence of simulation steps, where each new step depends on the previous ones. Due to this fundamental dependency, MCMC methods are inherently hard to parallelize on any architecture. The upcoming generations of hybrid CPU/GPGPU architectures with their multi-core CPUs and tightly coupled many-core GPGPUs provide new acceleration opportunities especially for MCMC methods, if the new degrees of freedom are exploited correctly. \r\nIn this paper, the outcomes of an interdisciplinary collaboration are presented, which focused on the parallel mapping of a MCMC molecular simulation from thermodynamics to hybrid CPU/GPGPU computing systems. While the mapping is designed for upcoming hybrid architectures, the implementation of this approach on an NVIDIA Tesla system already leads to a substantial speedup of more than 87x despite the additional communication overheads.",
         "file" : "http://www.iti.uni-stuttgart.de/fileadmin/rami/files/publications/2012/ICCD_BraunHWCG2012.pdf",
         
         "doi" : "http://dx.doi.org/10.1109/ICCD.2012.6378642",
         
         "bibtexKey": "BraunHWCG2012"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/22df42dffc60148e2d03c7de55549a3dc/clausbraun",         
         "tags" : [
            "ABFT","Fehlertoleranz","GPU","Rechnerarchitekturen","SimTech","Zuverlässigkeit","myown"
         ],
         
         "intraHash" : "2df42dffc60148e2d03c7de55549a3dc",
         "interHash" : "ac4312dfc7d2a467ecb4d4c9868fe852",
         "label" : "Algorithmen-basierte Fehlertoleranz für Many-Core-Architekturen;\r\nAlgorithm-based Fault-Tolerance on Many-Core Architectures",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:15:07",
         "changeDate" : "2018-03-19 15:27:50",
         "count" : 2,
         "pub-type": "article",
         "journal": "it - Information Technology","publisher":"Oldenbourg Wissenschaftsverlag",
         "year": "2010", 
         "url": "", 
         
         "author": [ 
            "Claus Braun","Hans-Joachim Wunderlich"
         ],
         "authors": [
         	
            	{"first" : "Claus",	"last" : "Braun"},
            	{"first" : "Hans-Joachim",	"last" : "Wunderlich"}
         ],
         "volume": "52","number": "4","pages": "209--215","abstract": "Moderne Many-Core-Architekturen bieten ein sehr hohes Potenzial an Rechenleistung. Dies macht sie besonders für Anwendungen aus dem Bereich des wissenschaftlichen Hochleistungsrechnens und der Simulationstechnik attraktiv. Die Architekturen folgen dabei einem Ausführungsparadigma, das sich am besten durch den Begriff ?Many-Threading? beschreiben lässt. Wie alle nanoelektronischen Halbleiterschaltungen leiden auch Many-Core-Prozessoren potentiell unter störenden Einflüssen von transienten Fehlern (soft errors) und diversen Arten von Variationen. Diese Faktoren können die Zuverlässigkeit von Systemen negativ beeinflussen und erfordern Fehlertoleranz auf allen Ebenen, von der Hardware bis zur Software. Auf der Softwareseite stellt die Algorithmen-basierte Fehlertoleranz (ABFT) eine ausgereifte Technik zur Verbesserung der Zuverlässigkeit dar. Der Aufwand für die Anpassung dieser Technik an moderne Many-Threading-Architekturen darf jedoch keinesfalls unterschätzt werden. In diesem Beitrag wird eine effiziente und fehlertolerante Abbildung der Matrixmultiplikation auf eine moderne Many-Core-Architektur präsentiert. Die Fehlertoleranz ist dabei integraler Bestandteil der Abbildung und wird durch ein ABFT-Schema realisiert, das die Leistung nur unwesentlich beeinträchtigt.\r\nModern many-core architectures provide a high computational potential, which makes them particularly interesting for applications from the fields of scientific high-performance computing and simulation technology. The execution paradigm of these architectures is best described as \u201CMany-Threading\u201D. Like all nano-scaled semiconductor devices, many-core processors are prone to transient errors (soft errors) and different kinds of variations that can have severe impact on the reliability of such systems. Therefore, fault-tolerance has to be incorporated at all levels, from the hardware up to the software. \nOn the software side, Algorithm-based Fault Tolerance (ABFT) is a mature technique to improve the reliability. However, significant effort is required to adapt this technique to modern many-threading architectures. In this article, an efficient and fault-tolerant mapping of the matrix multiplication to a modern many-core architecture is presented. Fault-tolerance is thereby an integral part of the mapping and implemented through an ABFT scheme with marginal impact on the overall performance.",
         "doi" : "http://dx.doi.org/10.1524/itit.2010.0593",
         
         "bibtexKey": "BraunW2010a"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2b9d42307aff55f949dce3efdc063ee86/clausbraun",         
         "tags" : [
            "Euler-Maruyama","GPU","SimTech","adaptive","aggregation","approximation","computing","heterogeneous","ligand-receptor-model","multi-timescale","myown","parallel","particle","simulation"
         ],
         
         "intraHash" : "b9d42307aff55f949dce3efdc063ee86",
         "interHash" : "8b3950b9a31a28b554ce868d67598d14",
         "label" : "Adaptive Parallel Simulation of a Two-Timescale-Model for Apoptotic Receptor-Clustering on GPUs",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:15:07",
         "changeDate" : "2018-03-19 15:21:25",
         "count" : 5,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the IEEE International Conference on Bioinformatics and Biomedicine (BIBM'14)",
         "year": "2014", 
         "url": "", 
         
         "author": [ 
            "Alexander Schöll","Claus Braun","Markus Daub","Guido Schneider","Hans-Joachim Wunderlich"
         ],
         "authors": [
         	
            	{"first" : "Alexander",	"last" : "Schöll"},
            	{"first" : "Claus",	"last" : "Braun"},
            	{"first" : "Markus",	"last" : "Daub"},
            	{"first" : "Guido",	"last" : "Schneider"},
            	{"first" : "Hans-Joachim",	"last" : "Wunderlich"}
         ],
         "pages": "424--431","abstract": "Computational biology contributes important solutions for major biological challenges. Unfortunately, most applications in computational biology are highly compute-intensive and associated with extensive computing times. Biological problems of interest are often not treatable with traditional simulation models on conventional multi-core CPU systems. This interdisciplinary work introduces a new multi-timescale simulation model for apoptotic receptor-clustering and a new parallel evaluation algorithm that exploits the computational performance of heterogeneous CPU-GPU computing systems. For this purpose, the different dynamics involved in receptor-clustering are separated and simulated on two timescales. Additionally, the time step sizes are adaptively refined on each timescale independently.\r\n This new approach improves the simulation performance significantly and reduces computing times from months to hours for observation times of several seconds.",
         "file" : "http://www.iti.uni-stuttgart.de/fileadmin/rami/files/publications/2014/BIBM_SchoeBDSW2014.pdf",
         
         "doi" : "http://dx.doi.org/10.1109/BIBM.2014.6999195",
         
         "bibtexKey": "SchoeBDSW2014"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/28ae495f897e6e1e00d2159bae7ffc325/clausbraun",         
         "tags" : [
            "ABFT","GPGPU","GPU","SimTech","fault-tolerance","imported","myown"
         ],
         
         "intraHash" : "8ae495f897e6e1e00d2159bae7ffc325",
         "interHash" : "2c870c5be652c307dc565990657ae91c",
         "label" : "Algorithm-Based Fault Tolerance for Many-Core Architectures",
         "user" : "clausbraun",
         "description" : "",
         "date" : "2018-03-19 16:15:07",
         "changeDate" : "2018-03-19 15:19:34",
         "count" : 6,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 15th IEEE European Test Symposium (ETS'10)","publisher":"IEEE Computer Society",
         "year": "2010", 
         "url": "", 
         
         "author": [ 
            "Claus Braun","Hans-Joachim Wunderlich"
         ],
         "authors": [
         	
            	{"first" : "Claus",	"last" : "Braun"},
            	{"first" : "Hans-Joachim",	"last" : "Wunderlich"}
         ],
         "pages": "253--253","abstract": "Modern many-core architectures with hundreds of cores provide a high computational potential. This makes them particularly interesting for scientific high-performance computing and simulation technology. Like all nano scaled semiconductor devices, many-core processors are prone to reliability harming factors like variations and soft errors. One way to improve the reliability of such systems is software-based hardware fault tolerance. Here, the software is able to detect and correct errors introduced by the hardware. In this work, we propose a software-based approach to improve the reliability of matrix operations on many-core processors. These operations are key components in many scientific applications.",
         "file" : "http://www.iti.uni-stuttgart.de//fileadmin/rami/files/publications/2010/ETS_BraunW2010.pdf",
         
         "doi" : "http://dx.doi.org/10.1109/ETSYM.2010.5512738",
         
         "bibtexKey": "BraunW2010"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/20fa2e309a2cd7a291449888471514bdf/thomasrichter",         
         "tags" : [
            "compression","cpu","gpu","image","low-complexity","myown"
         ],
         
         "intraHash" : "0fa2e309a2cd7a291449888471514bdf",
         "interHash" : "1c214f842d3f6bafc3be0b32e40c4d75",
         "label" : "Comparison of CPU and GPU Based Coding on Low-Complexity Algorithms for Display Signals",
         "user" : "thomasrichter",
         "description" : "",
         "date" : "2016-03-10 09:18:49",
         "changeDate" : "2016-03-10 08:35:27",
         "count" : 3,
         "pub-type": "inproceedings",
         "booktitle": "Applications of Digital Image Processing XXXVI","publisher":"SPIE",
         "year": "2013", 
         "url": "http://spie.org/Publications/Proceedings/Paper/10.1117/12.2022398", 
         
         "author": [ 
            "T. Richter","S. Simon"
         ],
         "authors": [
         	
            	{"first" : "T.",	"last" : "Richter"},
            	{"first" : "S.",	"last" : "Simon"}
         ],
         
         "editor": [ 
            "Andrew G. Tescher"
         ],
         "editors": [
         	
            	{"first" : "Andrew G.",	"last" : "Tescher"}
         ],
         "volume": "8856","pages": "14 pages","abstract": "Graphics Processing Units (GPUs) are freely programmable massively parallel general purpose processing units and thus offer the opportunity to off-load heavy computations from the CPU to the GPU. One application for GPU programming is image compression, where the massively parallel nature of GPUs promises high speed benefits. This article analyzes the predicaments of data-parallel image coding on the example of two high-throughput coding algorithms. The codecs discussed here were designed to answer a call from the Video Electronics Standards Association (VESA), and require only minimal buffering at encoder and decoder side while avoiding any pixel-based feedback loops limiting the operating frequency of hardware implementations. Comparing CPU and GPU implementations of the codes show that GPU based codes are usually not considerably faster, or perform only with less than ideal rate-distortion performance. Analyzing the details of this result provides theoretical evidence that for any coding engine either parts of the entropy coding and bit-stream build-up must remain serial, or rate-distortion penalties must be paid when offloading all computations on the GPU.",
         "isbn" : "9780819497062",
         
         "doi" : "10.1117/12.2022398",
         
         "bibtexKey": "richter2013comparison"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2145f228aba15e1087964fa870e3313a5/thomasrichter",         
         "tags" : [
            "GPU","compression","image","myown"
         ],
         
         "intraHash" : "145f228aba15e1087964fa870e3313a5",
         "interHash" : "9a361f143f5124b656cc251ae274785f",
         "label" : "Coding Strategies and Performance Analysis of GPU Accelerated Image Compression",
         "user" : "thomasrichter",
         "description" : "",
         "date" : "2016-03-10 09:18:49",
         "changeDate" : "2016-03-10 08:23:53",
         "count" : 4,
         "pub-type": "inproceedings",
         "booktitle": "Picture Coding Symposium (PCS), 2013","publisher":"IEEE","address":"San Jose, CA",
         "year": "2013", 
         "url": "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6737699", 
         
         "author": [ 
            "T Richter","S. Simon"
         ],
         "authors": [
         	
            	{"first" : "T",	"last" : "Richter"},
            	{"first" : "S.",	"last" : "Simon"}
         ],
         "pages": "125-128","abstract": "Graphics Processing Units (GPUs) are freely programmable massively parallel general purpose processing units and thus offer the opportunity to off-load heavy computations from the CPU to the GPU. One application for GPU programming is image compression, where the massively parallel nature of GPUs promises high speed benefits. However, measurements with competitive highly optimized CPU implementations show that GPU based codes are usually not considerably faster, or perform only with less than ideal rate-distortion performance. This article presents the predicaments of data-parallel image coding by first presenting a series of theoretical arguments that limit the performance of such implementations before advancing to existing GPU implementations demonstrating the challenges of parallel image coding. It will be argued and seen on experiments that either parts of the entropy coding and bitstream build-up must remain serial, or rate-distortion penalties must be paid when offloading all computations on the GPU.",
         "isbn" : "978-1-4799-0292-7",
         
         "doi" : "10.1109/PCS.2013.6737699",
         
         "bibtexKey": "richter2013coding"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2a4a5c9728db0683dda1b2b3318fa0c9d/amerwafai",         
         "tags" : [
            "Back","CUDA","GPGPU","GPU","HLRS","Network","Neural","Parallelisation","Propagation","SCOPE","myown"
         ],
         
         "intraHash" : "a4a5c9728db0683dda1b2b3318fa0c9d",
         "interHash" : "0f4f1b3c77004231f7c17f1c80480dfe",
         "label" : "Optimization of industrial Neural Network simulators for GPGPUs",
         "user" : "amerwafai",
         "description" : "",
         "date" : "2016-01-29 09:34:55",
         "changeDate" : "2016-01-29 08:42:48",
         "count" : 1,
         "pub-type": "inproceedings",
         "booktitle": "New Horizons in Web Based Learning","series": "LNCS","publisher":"Springer Berlin Heidelberg",
         "year": "2011", 
         "url": "", 
         
         "author": [ 
            "Mhd. Amer Wafai","Zaheer Ahmed","Rainer Keller","Sven Holzmann","Björn Sander","Michael Resch"
         ],
         "authors": [
         	
            	{"first" : "Mhd. Amer",	"last" : "Wafai"},
            	{"first" : "Zaheer",	"last" : "Ahmed"},
            	{"first" : "Rainer",	"last" : "Keller"},
            	{"first" : "Sven",	"last" : "Holzmann"},
            	{"first" : "Björn",	"last" : "Sander"},
            	{"first" : "Michael",	"last" : "Resch"}
         ],
         
         "editor": [ 
            "Dickson K. W. Chiu","Minhong Wang","Elvira Popescu","Qing Li","Rynson Lau"
         ],
         "editors": [
         	
            	{"first" : "Dickson K. W.",	"last" : "Chiu"},
            	{"first" : "Minhong",	"last" : "Wang"},
            	{"first" : "Elvira",	"last" : "Popescu"},
            	{"first" : "Qing",	"last" : "Li"},
            	{"first" : "Rynson",	"last" : "Lau"}
         ],
         "volume": "7697","pages": "21-29","abstract": "This paper introduces the porting of an industrial neural network simulator onto GPUs used in a tool-chain to sort massive amounts of E-mails and other textual data. Compared to other previous work, all steps are being executed on the GPU, achieving overall up to 33× speedup without using any cuBLAS functionality. All the time-consuming routines have been ported onto the GPU, i.e. the training-, the simulation- and the verification-phases, the training being the most time-consuming. It is planned to include these GPU-kernels into the product for special costumer's demands.",
         "date-added" : "2015-08-18 14:03:50 +0000",
         
         "date-modified" : "2015-08-18 14:20:22 +0000",
         
         "bibtexKey": "wafai12"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2397984814dd598e14dfb34e12d7da8af/amerwafai",         
         "tags" : [
            "GPU","HLRS","HPC","Hardware","Parallel","Portable","Programming","SCOPE","architecture","mown","myown"
         ],
         
         "intraHash" : "397984814dd598e14dfb34e12d7da8af",
         "interHash" : "372f16a4d9fcc1ae19a70b705f96897a",
         "label" : "Portable Codes on New HPC Architectures",
         "user" : "amerwafai",
         "description" : "",
         "date" : "2016-01-29 09:34:55",
         "changeDate" : "2016-01-29 08:39:52",
         "count" : 2,
         "pub-type": "inproceedings",
         "booktitle": "Facing the Multicore-Challenge III 2012","series": "LNCS","publisher":"Springer Berlin Heidelberg",
         "year": "2012", 
         "url": "", 
         
         "author": [ 
            "Mhd. Amer Wafai","Colin W. Glass","Christoph Niethammer"
         ],
         "authors": [
         	
            	{"first" : "Mhd. Amer",	"last" : "Wafai"},
            	{"first" : "Colin W.",	"last" : "Glass"},
            	{"first" : "Christoph",	"last" : "Niethammer"}
         ],
         
         "editor": [ 
            "Rainer Keller","David Kramer","Jan-Philipp Weiß"
         ],
         "editors": [
         	
            	{"first" : "Rainer",	"last" : "Keller"},
            	{"first" : "David",	"last" : "Kramer"},
            	{"first" : "Jan-Philipp",	"last" : "Weiß"}
         ],
         "volume": "7686","pages": "133-134",
         "date-added" : "2015-08-18 14:21:33 +0000",
         
         "date-modified" : "2015-08-19 08:21:35 +0000",
         
         "bibtexKey": "wafai13"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2ce0df7291d15f3b0f1467222f3958c4d/amerwafai",         
         "tags" : [
            "Architecture","CUDA","Compute","Device","GPU","Graphics","HLRS","Processing","SCOPE","Unified","Unit","myown"
         ],
         
         "intraHash" : "ce0df7291d15f3b0f1467222f3958c4d",
         "interHash" : "782fe775e19c904f41d1b5e185f207f1",
         "label" : "Sparse matrix vector multiplications on graphic processors",
         "user" : "amerwafai",
         "description" : "",
         "date" : "2016-01-29 09:34:55",
         "changeDate" : "2016-01-29 08:39:18",
         "count" : 1,
         "pub-type": "mastersthesis",
         "address":"Nobelstr. 19, 70569, Stuttgart",
         "year": "2009", 
         "url": "", 
         
         "author": [ 
            "Mhd. Amer Wafai"
         ],
         "authors": [
         	
            	{"first" : "Mhd. Amer",	"last" : "Wafai"}
         ],
         "abstract": "The modern computer architecture is moving towards multi-core systems. Intel processors are now coming with double or even quad cores like Xeon processor. Graphics Processing Units (GPUs) are considered to be highly parallel multi-core processors with tremendous performance. They are specially designed to deal with 3D and realtime graphics. And after the introduction of the new API, Compute Unified Device Architecture (CUDA), from NVIDA, the GPU became an attractive choice for general purpose parallel computing to solve many complex numerical problems. \r\nSparse Matrix-Vector (SpMV) multiplication is one of the most important kernels in scientific computing. Its sparsity, irregularity and indirect addressing properties present new challenges to map it to multi-core systems. \r\nThe objective of this work is to analyze the speed of execution of SpMV multiplication on NVIDIA GPUs (Tesla C1060). An algorithm based on a tailored version of ELLPACK, called Aligned-ELLPACK-R, as well as different algorithms have been developed using different storage formats. These implementations are done using the programming language CUDA. Finally the comparison of that performance has been done with respect to different implementations of SpMV on Intel Xeon E5560 processor using Jagged Diagonal Formats (JAD), ELLPACK and ELLPACK-R storage formats. \r\nThe results show the superiority of JAD storage format over the matrices used to test SpMV on conventional super scaler processors. SpMV on Tesla C1060 based on Aligned-ELLPACK-R outperforms the fastest implementation on CPU with speedup factor 13 times. It also outperforms the CUDA library based on ELLPACK with 2.3 speedup factor.",
         "date-added" : "2015-08-18 13:53:17 +0000",
         
         "date-modified" : "2015-08-18 14:02:15 +0000",
         
         "bibtexKey": "wafai09"

      }
	  
   ]
}
