SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.
%0 Conference Paper
%1 breyer2024evaluation
%A Breyer, Marcel
%A Van Craen, Alexander
%A Pflüger, Dirk
%B Proceedings of the 12th International Workshop on OpenCL and SYCL
%C New York, NY, USA
%D 2024
%I Association for Computing Machinery
%K EXC2075 PN6-2(II) curated PN6
%P 1-4
%R 10.1145/3648115.3648130
%T Evaluation of SYCL’s Different Data Parallel Kernels
%U https://doi.org/10.1145/3648115.3648130
%X SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.
%@ 9798400717901
% IWOCL '24 conference paper. The page range uses the BibTeX double hyphen, acronyms in
% title/booktitle are brace-protected against style recasing, and the accented author
% name is written as a BibTeX special character so classic BibTeX sorts it correctly.
@inproceedings{breyer2024evaluation,
  abstract  = {SYCL provides programmers with four, and in the case of AdaptiveCpp even five, ways for calling and writing a device kernel. This paper analyzes the performance of these diverse kernel invocation types for DPC++ and AdaptiveCpp as SYCL implementations on an NVIDIA A100 GPU, an AMD Instinct MI210 GPU, and a dual-socket AMD EPYC 9274F CPU. Using the example of a kernel matrix assembly, we show why the performance can differ by a factor of 100 in the worst case on the same hardware for the same problem using different SYCL implementations and kernel invocation types.},
  added-at  = {2024-09-30T13:17:58.000+0200},
  address   = {New York, NY, USA},
  articleno = {10},
  author    = {Breyer, Marcel and Van Craen, Alexander and Pfl{\"u}ger, Dirk},
  biburl    = {https://puma.ub.uni-stuttgart.de/bibtex/2e8ebd2f42b453167d2065725486828cb/simtech},
  booktitle = {Proceedings of the 12th International Workshop on {OpenCL} and {SYCL}},
  doi       = {10.1145/3648115.3648130},
  interhash = {bfbc52cd98d241445f5051b284bf6ded},
  intrahash = {e8ebd2f42b453167d2065725486828cb},
  isbn      = {9798400717901},
  keywords  = {EXC2075 PN6-2(II) curated PN6},
  language  = {english},
  location  = {Chicago, IL, USA},
  month     = apr,
  numpages  = {4},
  pages     = {1--4},
  publisher = {Association for Computing Machinery},
  series    = {IWOCL '24},
  timestamp = {2025-06-23T09:45:25.000+0200},
  title     = {Evaluation of {SYCL}'s Different Data Parallel Kernels},
  url       = {https://doi.org/10.1145/3648115.3648130},
  year      = 2024
}