{  
   "types" : {
      "Bookmark" : {
         "pluralLabel" : "Bookmarks"
      },
      "Publication" : {
         "pluralLabel" : "Publications"
      },
      "GoldStandardPublication" : {
         "pluralLabel" : "GoldStandardPublications"
      },
      "GoldStandardBookmark" : {
         "pluralLabel" : "GoldStandardBookmarks"
      },
      "Tag" : {
         "pluralLabel" : "Tags"
      },
      "User" : {
         "pluralLabel" : "Users"
      },
      "Group" : {
         "pluralLabel" : "Groups"
      },
      "Sphere" : {
         "pluralLabel" : "Spheres"
      }
   },
   
   "properties" : {
      "count" : {
         "valueType" : "number"
      },
      "date" : {
         "valueType" : "date"
      },
      "changeDate" : {
         "valueType" : "date"
      },
      "url" : {
         "valueType" : "url"
      },
      "id" : {
         "valueType" : "url"
      },
      "tags" : {
         "valueType" : "item"
      },
      "user" : {
         "valueType" : "item"
      }      
   },
   
   "items" : [
   	  
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/212bbebb35041c50aeb84aec3177a1311/ralfschneider",         
         "tags" : [
            "MPI","collective","communication","hybrid","memory","model","myown","programming","shared"
         ],
         
         "intraHash" : "12bbebb35041c50aeb84aec3177a1311",
         "interHash" : "942d7e4eea20dfc15b05fa5663d95a48",
         "label" : "MPI Collectives for Multi-core Clusters: Optimized Performance of the Hybrid MPI+ MPI Parallel Codes",
         "user" : "ralfschneider",
         "description" : "",
         "date" : "2021-09-28 11:26:33",
         "changeDate" : "2021-09-28 09:26:33",
         "count" : 4,
         "pub-type": "inproceedings",
         "booktitle": "Proceedings of the 48th International Conference on Parallel Processing: Workshops","series": "ICPP 2019","publisher":"ACM","address":"New York, NY, USA",
         "year": "2019", 
         "url": "https://doi.org/10.1145/3339186.3339199", 
         
         "author": [ 
            "Huan Zhou","José Gracia","Ralf Schneider"
         ],
         "authors": [
         	
            	{"first" : "Huan",	"last" : "Zhou"},
            	{"first" : "José",	"last" : "Gracia"},
            	{"first" : "Ralf",	"last" : "Schneider"}
         ],
         "pages": "18:1-18:10","abstract": "The advent of multi-/many-core processors in clusters advocates hybrid parallel programming,\r\nwhich combines Message Passing Interface (MPI) for inter-node parallelism with a shared\r\nmemory model for on-node parallelism. Compared to the traditional hybrid approach\r\nof MPI plus OpenMP, a new, but promising hybrid approach of MPI plus MPI-3 shared-memory\r\nextensions (MPI+MPI) is gaining attraction. We describe an algorithmic approach for\r\ncollective operations (with allgather and broadcast as concrete examples) in the context\r\nof hybrid MPI+MPI, so as to minimize memory consumption and memory copies. With this\r\napproach, only one memory copy is maintained and shared by on-node processes. This\r\nallows the removal of unnecessary on-node copies of replicated data that are required\r\nbetween MPI processes when the collectives are invoked in the context of pure MPI.\r\nWe compare our approach of collectives for hybrid MPI+MPI and the traditional one\r\nfor pure MPI, and also have a discussion on the synchronization that is required to\r\nguarantee data integrity. The performance of our approach has been validated on a\r\nCray XC40 system (Cray MPI) and NEC cluster (Open MPI), showing that it achieves comparable\r\nor better performance for allgather operations. We have further validated our approach\r\nwith a standard computational kernel, namely distributed matrix multiplication, and\r\na Bayesian Probabilistic Matrix Factorization code.",
         "isbn" : "9781450371964",
         
         "numpages" : "10",
         
         "articleno" : "18",
         
         "location" : "Kyoto, Japan",
         
         "doi" : "10.1145/3339186.3339199",
         
         "bibtexKey": "10.1145/3339186.3339199"

      }
,
      {
         "type" : "Publication",
         "id"   : "https://puma.ub.uni-stuttgart.de/bibtex/2dccb0302d8b765b1450e86f4b2798951/amerwafai",         
         "tags" : [
            "HLRS","SCOPE","absolute","and","calculation","cell","colored","core","distributed","dynamic","force","gather","intel","interaction","law","linked","memory","molecular","myown","newton","operation","optimization","parallel","parallelization","performance","phi","processing","range","scatter","shared","short","site","third","xeon","yellow"
         ],
         
         "intraHash" : "dccb0302d8b765b1450e86f4b2798951",
         "interHash" : "902fd5f88d25cf9f9a271091a6bbd41a",
         "label" : "Optimized Force Calculation of Molecular Dynamics Simulations for the Intel Xeon Phi",
         "user" : "amerwafai",
         "description" : "",
         "date" : "2016-01-29 09:34:55",
         "changeDate" : "2016-01-29 08:41:13",
         "count" : 1,
         "pub-type": "conference",
         "booktitle": "Euro-Par 2015: Parallel Processing","series": "LNCS","address":"Vienna, Austria",
         "year": "2015", 
         "url": "", 
         
         "author": [ 
            "Nikola Tchipev","Amer Wafai","Colin W. Glass","Wolfgang Eckhardt","Alexander Heinecke","Hans-Joachim Bungartz","Philipp Neumann"
         ],
         "authors": [
         	
            	{"first" : "Nikola",	"last" : "Tchipev"},
            	{"first" : "Amer",	"last" : "Wafai"},
            	{"first" : "Colin W.",	"last" : "Glass"},
            	{"first" : "Wolfgang",	"last" : "Eckhardt"},
            	{"first" : "Alexander",	"last" : "Heinecke"},
            	{"first" : "Hans-Joachim",	"last" : "Bungartz"},
            	{"first" : "Philipp",	"last" : "Neumann"}
         ],
         
         "editor": [ 
            "Jesper Larsson Träff","Sascha Hunold","Francesco Versaci"
         ],
         "editors": [
         	
            	{"first" : "Jesper Larsson",	"last" : "Träff"},
            	{"first" : "Sascha",	"last" : "Hunold"},
            	{"first" : "Francesco",	"last" : "Versaci"}
         ],
         "volume": "9233","abstract": "We provide details on the shared-memory parallelization for manycore architectures of the molecular dynamics framework ls1-mardyn, including an optimization of the SIMD vectorization for multi-centered molecules. The novel shared-memory parallelization scheme allows to re- tain Newton's third law optimization and exhibits very good scaling on many-core devices such as a full Xeon Phi card running 240 threads. The Xeon Phi can thus be exploited and delivers comparable performance as IvyBridge nodes in our experiments.",
         "date-added" : "2015-08-19 09:05:42 +0000",
         
         "date-modified" : "2015-08-19 09:10:27 +0000",
         
         "bibtexKey": "wafai15"

      }
	  
   ]
}