#include #include #include #include #include #include #include #include #include #include #include inline double usecond(void) { struct timeval tv; gettimeofday(&tv,NULL); return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec; } #define GRID_SYCL_LEVEL_ZERO_IPC cl::sycl::queue *theGridAccelerator; cl::sycl::queue *tmpGridAccelerator; uint32_t acceleratorThreads(void); void acceleratorThreads(uint32_t); void acceleratorInit(void); #define accelerator #define accelerator_inline strong_inline #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ unsigned long nt=acceleratorThreads(); \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ if(nt < 8)nt=8; \ cl::sycl::range<3> local {nt,1,nsimd}; \ cl::sycl::range<3> global{unum1,unum2,nsimd}; \ cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ [[intel::reqd_sub_group_size(8)]] \ { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ auto lane = item.get_global_id(2); \ { __VA_ARGS__ }; \ }); \ }); #define accelerator_barrier(dummy) theGridAccelerator->wait(); #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); #define accelerator_for( iter, num, nsimd, ... ) \ accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \ accelerator_barrier(dummy); #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \ accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \ accelerator_barrier(dummy); inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} int acceleratorAbortOnGpuError=1; uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; void acceleratorInit(void) { int nDevices = 1; cl::sycl::gpu_selector selector; cl::sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (selectedDevice); #ifdef GRID_SYCL_LEVEL_ZERO_IPC zeInit(0); #endif char * localRankStr = NULL; int rank = 0, world_rank=0; #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" // We extract the local rank initialization using an environment variable if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { rank = atoi(localRankStr); } if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { rank = atoi(localRankStr); } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} auto devices = cl::sycl::device::get_devices(); for(int d = 0;d().c_str()); #define GPU_PROP_FMT(prop,FMT) \ printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); GPU_PROP_STR(vendor); GPU_PROP_STR(version); GPU_PROP(global_mem_size); } if ( world_rank == 0 ) { auto name = theGridAccelerator->get_device().get_info(); printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); printf("AcceleratorSyclInit: ================================================\n"); } } void sharedMemoryInit(MPI_Comm comm); void sharedMemoryAllocate(size_t bytes); void sharedMemoryTest(size_t bytes); MPI_Comm communicator_world; void mpiInit(int *argc,char ***argv) { int flag; int provided; MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); // Never clean up as done once. MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); sharedMemoryInit(communicator_world); sharedMemoryAllocate(1024L*1024L*1024L); } std::vector WorldShmCommBufs; MPI_Comm WorldComm; int WorldRank; int WorldSize; MPI_Comm WorldShmComm; int WorldShmRank; int WorldShmSize; int WorldNodes; int WorldNode; std::vector WorldShmRanks; void sharedMemoryInit(MPI_Comm comm) { #define header "SharedMemoryMpi: " WorldComm = comm; MPI_Comm_rank(WorldComm,&WorldRank); MPI_Comm_size(WorldComm,&WorldSize); // WorldComm, WorldSize, WorldRank ///////////////////////////////////////////////////////////////////// // Split into groups that can share memory ///////////////////////////////////////////////////////////////////// MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_size(WorldShmComm ,&WorldShmSize); if ( WorldRank == 0) { std::cout << header " World communicator of size " < world_ranks(WorldSize); for(int r=0;r MyGroup; MyGroup.resize(WorldShmSize); for(int rank=0;rank()); int myleader = MyGroup[0]; std::vector leaders_1hot(WorldSize,0); std::vector leaders_group(WorldNodes,0); leaders_1hot [ myleader ] = 1; /////////////////////////////////////////////////////////////////// // global sum leaders over comm world /////////////////////////////////////////////////////////////////// int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm); assert(ierr==0); /////////////////////////////////////////////////////////////////// // find the group leaders world rank /////////////////////////////////////////////////////////////////// int group=0; for(int l=0;lfill((void *)ShmCommBuf, value, bytes).wait(); std::cout<< "Setting up IPC"<(theGridAccelerator->get_device()); auto zeContext = cl::sycl::get_native(theGridAccelerator->get_context()); ////////////////////////////////////////////////// // If it is me, pass around the IPC access key ////////////////////////////////////////////////// typedef struct { int fd; pid_t pid ; } clone_mem_t; ze_ipc_mem_handle_t handle; clone_mem_t what_intel_should_have_done; std::cout << " sizeof(ze_ipc_mem_handle_t) is " <