Between a widening range of GPU vendors and the trend of having more GPUs per compute node in supercomputers such as Summit, Perlmutter, Frontier and Aurora, developing performant yet portable distributed HPC applications becomes ever more challenging. Leveraging existing solutions like Kokkos for platform-independent code and HPX for distributing the application in a task-based fashion can alleviate these challenges. However, using such frameworks in the same application requires them to work together seamlessly. In this work we present an HPX Kokkos integration that works both ways: we can integrate CPU and GPU Kokkos kernels as HPX tasks and inversely use HPX worker threads to work on Kokkos kernels. Using HPX futures makes launching and synchronizing Kokkos kernels from multiple threads easy, allowing us to move away from the more traditional fork-join model. To evaluate our integrations we ported existing Vc and CUDA kernels within an existing HPX application, Octo-Tiger, to use Kokkos instead. We achieve comparable, or better, performance than with previous Vc and CUDA kernels, showing both the viability of our HPX Kokkos integration, as well as future-proofing Octo-Tiger for a wider range of potential machines. Furthermore, we introduce event polling for synchronizing CUDA kernels (or Kokkos kernels on the respective backend) achieving speedups over the previous solution using callbacks.
%0 Conference Paper
%1 dais_beyond_2021
%A Daiß, Gregor
%A Simberg, Mikael
%A Reverdell, Auriane
%A Biddiscombe, John
%A Pollinger, Theresa
%A Kaiser, Hartmut
%A Pflüger, Dirk
%B 2021 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)
%D 2021
%K Kokkos Performance Portability Task-based hpc myown CUDA GPU HPX
%P 377-386
%R 10.1109/IPDPSW52791.2021.00066
%T Beyond Fork-Join: Integration of Performance Portable Kokkos Kernels with HPX
%X Between a widening range of GPU vendors and the trend of having more GPUs per compute node in supercomputers such as Summit, Perlmutter, Frontier and Aurora, developing performant yet portable distributed HPC applications becomes ever more challenging. Leveraging existing solutions like Kokkos for platform-independent code and HPX for distributing the application in a task-based fashion can alleviate these challenges. However, using such frameworks in the same application requires them to work together seamlessly. In this work we present an HPX Kokkos integration that works both ways: we can integrate CPU and GPU Kokkos kernels as HPX tasks and inversely use HPX worker threads to work on Kokkos kernels. Using HPX futures makes launching and synchronizing Kokkos kernels from multiple threads easy, allowing us to move away from the more traditional fork-join model. To evaluate our integrations we ported existing Vc and CUDA kernels within an existing HPX application, Octo-Tiger, to use Kokkos instead. We achieve comparable, or better, performance than with previous Vc and CUDA kernels, showing both the viability of our HPX Kokkos integration, as well as future-proofing Octo-Tiger for a wider range of potential machines. Furthermore, we introduce event polling for synchronizing CUDA kernels (or Kokkos kernels on the respective backend) achieving speedups over the previous solution using callbacks.
% Workshop paper. {Kokkos} and {HPX} are brace-protected in the title so that sentence-casing bibliography styles keep their capitalisation.
@inproceedings{dais_beyond_2021,
  abstract   = {Between a widening range of {GPU} vendors and the trend of having more {GPUs} per compute node in supercomputers such as Summit, Perlmutter, Frontier and Aurora, developing performant yet portable distributed {HPC} applications becomes ever more challenging. Leveraging existing solutions like Kokkos for platform-independent code and {HPX} for distributing the application in a task-based fashion can alleviate these challenges. However, using such frameworks in the same application requires them to work together seamlessly. In this work we present an {HPX} Kokkos integration that works both ways: we can integrate {CPU} and {GPU} Kokkos kernels as {HPX} tasks and inversely use {HPX} worker threads to work on Kokkos kernels. Using {HPX} futures makes launching and synchronizing Kokkos kernels from multiple threads easy, allowing us to move away from the more traditional fork-join model. To evaluate our integrations we ported existing Vc and {CUDA} kernels within an existing {HPX} application, Octo-Tiger, to use Kokkos instead. We achieve comparable, or better, performance than with previous Vc and {CUDA} kernels, showing both the viability of our {HPX} Kokkos integration, as well as future-proofing Octo-Tiger for a wider range of potential machines. Furthermore, we introduce event polling for synchronizing {CUDA} kernels (or Kokkos kernels on the respective backend) achieving speedups over the previous solution using callbacks.},
  added-at   = {2022-03-24T16:12:08.000+0100},
  author     = {Daiß, Gregor and Simberg, Mikael and Reverdell, Auriane and Biddiscombe, John and Pollinger, Theresa and Kaiser, Hartmut and Pflüger, Dirk},
  biburl     = {https://puma.ub.uni-stuttgart.de/bibtex/2cb291e05a1a3bbb91a404b35dfb148a0/tpollinger},
  booktitle  = {2021 {IEEE} International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
  doi        = {10.1109/IPDPSW52791.2021.00066},
  eventtitle = {2021 {IEEE} International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
  interhash  = {23aeada47e005c4caa88b8ee189c767a},
  intrahash  = {cb291e05a1a3bbb91a404b35dfb148a0},
  keywords   = {Kokkos Performance Portability Task-based hpc myown {CUDA} {GPU} {HPX}},
  pages      = {377--386},
  shorttitle = {Beyond Fork-Join},
  timestamp  = {2022-03-24T15:12:36.000+0100},
  title      = {Beyond Fork-Join: Integration of Performance Portable {Kokkos} Kernels with {HPX}},
  year       = {2021}
}