Loading Documentation/Pages/main-page.md +3 −3 Original line number Diff line number Diff line Loading @@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries. - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for computations on Nvidia GPUs. - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for distributed computing. For distributed CUDA computations, the library must be [CUDA-aware]( a library implementing the MPI-3 standard for distributed computing (e.g. [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations, the library must be [CUDA-aware]( https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/). - __Libraries:__ Loading src/TNL/MPI/Utils.h +30 −0 Original line number Diff line number Diff line Loading @@ -42,5 +42,35 @@ inline void restoreRedirection() } } /** * \brief Returns a local rank ID of the current process within a group of * processes running on a shared-memory node. * * The given MPI communicator is split into groups according to the * `MPI_COMM_TYPE_SHARED` type (from MPI-3) and the rank ID of the process * within the group is returned. 
*/ inline int getRankOnNode( MPI_Comm group = AllGroup() ) { #ifdef HAVE_MPI const int rank = GetRank(group); MPI_Info info; MPI_Info_create( &info ); MPI_Comm local_comm; MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, rank, info, &local_comm ); const int local_rank = GetRank( local_comm ); MPI_Comm_free(&local_comm); MPI_Info_free(&info); return local_rank; #else return 0; #endif } } // namespace MPI } // namespace TNL src/TNL/MPI/Wrappers.h +6 −1 Original line number Diff line number Diff line Loading @@ -22,11 +22,13 @@ #include <TNL/Assert.h> #include "getDataType.h" #include "selectGPU.h" namespace TNL { namespace MPI { // forward declaration to break cyclic inclusion inline void selectGPU(); // function wrappers for MPI constants inline MPI_Comm AllGroup() Loading Loading @@ -345,3 +347,6 @@ void Alltoall( const T* sendData, } // namespace MPI } // namespace TNL // late inclusion to break cyclic inclusion #include "selectGPU.h" src/TNL/MPI/selectGPU.h +5 −40 Original line number Diff line number Diff line Loading @@ -10,63 +10,28 @@ #pragma once #include <cstring> #include <TNL/Cuda/CheckDevice.h> #include "Utils.h" namespace TNL { namespace MPI { namespace { #ifdef HAVE_MPI #ifdef HAVE_CUDA typedef struct __attribute__((__packed__)) { char name[MPI_MAX_PROCESSOR_NAME]; } procName; #endif #endif inline void selectGPU() { #ifdef HAVE_MPI #ifdef HAVE_CUDA int size; MPI_Comm_size( MPI_COMM_WORLD, &size ); int rank; MPI_Comm_rank( MPI_COMM_WORLD, &rank ); int gpuCount; cudaGetDeviceCount(&gpuCount); procName names[size]; int i=0; int len; MPI_Get_processor_name(names[rank].name, &len); for(i=0;i<size;i++) std::memcpy(names[i].name,names[rank].name,len+1); MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR, (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR, MPI_COMM_WORLD); int nodeRank=0; for(i=0;i<rank;i++) { if(std::strcmp(names[rank].name,names[i].name)==0) nodeRank++; } const int gpuNumber = nodeRank % gpuCount; const int local_rank = getRankOnNode(); 
const int gpuNumber = local_rank % gpuCount; cudaSetDevice(gpuNumber); TNL_CHECK_CUDA_DEVICE; //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl; #endif #endif } } // namespace <unnamed> } // namespace MPI } // namespace TNL Loading
Documentation/Pages/main-page.md +3 −3
@@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries.
 - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for computations on Nvidia GPUs.
 - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) --
-  TNL can use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for distributed computing. For distributed CUDA computations, the library must be [CUDA-aware](
+  a library implementing the MPI-3 standard for distributed computing (e.g. [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations, the library must be [CUDA-aware](
   https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).
 - __Libraries:__
src/TNL/MPI/Utils.h +30 −0 Original line number Diff line number Diff line Loading @@ -42,5 +42,35 @@ inline void restoreRedirection() } } /** * \brief Returns a local rank ID of the current process within a group of * processes running on a shared-memory node. * * The given MPI communicator is split into groups according to the * `MPI_COMM_TYPE_SHARED` type (from MPI-3) and the rank ID of the process * within the group is returned. */ inline int getRankOnNode( MPI_Comm group = AllGroup() ) { #ifdef HAVE_MPI const int rank = GetRank(group); MPI_Info info; MPI_Info_create( &info ); MPI_Comm local_comm; MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, rank, info, &local_comm ); const int local_rank = GetRank( local_comm ); MPI_Comm_free(&local_comm); MPI_Info_free(&info); return local_rank; #else return 0; #endif } } // namespace MPI } // namespace TNL
src/TNL/MPI/Wrappers.h +6 −1 Original line number Diff line number Diff line Loading @@ -22,11 +22,13 @@ #include <TNL/Assert.h> #include "getDataType.h" #include "selectGPU.h" namespace TNL { namespace MPI { // forward declaration to break cyclic inclusion inline void selectGPU(); // function wrappers for MPI constants inline MPI_Comm AllGroup() Loading Loading @@ -345,3 +347,6 @@ void Alltoall( const T* sendData, } // namespace MPI } // namespace TNL // late inclusion to break cyclic inclusion #include "selectGPU.h"
// src/TNL/MPI/selectGPU.h, hunk @@ -10,63 +10,28 @@ -- post-change implementation.
// File-level lines of the header shown in the hunk (needed when HAVE_MPI and
// HAVE_CUDA are defined):
//    #pragma once
//    #include <TNL/Cuda/CheckDevice.h>   -- TNL_CHECK_CUDA_DEVICE
//    #include "Utils.h"                  -- getRankOnNode()

namespace TNL {
namespace MPI {

/**
 * \brief Selects the CUDA device used by the calling MPI process.
 *
 * The device index is the result of getRankOnNode() modulo the number of
 * devices reported by `cudaGetDeviceCount`, i.e. consecutive node-local
 * ranks are assigned to consecutive devices, wrapping around when there are
 * more processes than devices.
 *
 * The function is defined directly in `TNL::MPI` (not in an unnamed
 * namespace) so that it matches the forward declaration in Wrappers.h.
 *
 * The body is empty unless both `HAVE_MPI` and `HAVE_CUDA` are defined.
 */
inline void selectGPU()
{
#ifdef HAVE_MPI
#ifdef HAVE_CUDA
   // Initialized so that a failed query cannot leave an indeterminate divisor.
   int gpuCount = 0;
   cudaGetDeviceCount( &gpuCount );

   // Called unconditionally, before looking at gpuCount: getRankOnNode()
   // splits a communicator, which is collective in MPI, so it must not be
   // skipped on processes that see no device.
   const int local_rank = getRankOnNode();

   // Without a visible device the modulo would divide by zero; in that case
   // no device is selected and only the check below is performed.
   if( gpuCount > 0 ) {
      const int gpuNumber = local_rank % gpuCount;
      cudaSetDevice( gpuNumber );
   }

   // NOTE(review): presumably reports the last CUDA error (including one from
   // a failed device-count query) -- confirm against TNL/Cuda/CheckDevice.h.
   TNL_CHECK_CUDA_DEVICE;
#endif
#endif
}

} // namespace MPI
} // namespace TNL