MPI: added getRankOnNode and removed MPI_Get_processor_name from selectGPU (53758352) · Commits · TNL / tnl-dev

Documentation/Pages/main-page.md

+3 −3

Original line number	Diff line number	Diff line
		@@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries.
		- [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for
		computations on Nvidia GPUs.
		- [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can
		use a library implementing the MPI-3 standard for distributed computing (e.g.
		[OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations,
		the library must be [CUDA-aware](
		https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).

		- __Libraries:__

+30 −0

Original line number	Diff line number	Diff line
		@@ -42,5 +42,35 @@ inline void restoreRedirection()
		}
		}

		/**
		 * \brief Determines the rank of the calling process among the processes
		 * that run on the same shared-memory node.
		 *
		 * The communicator \e group is partitioned with `MPI_Comm_split_type` using
		 * the `MPI_COMM_TYPE_SHARED` split type (introduced in MPI-3); the rank of
		 * the process in \e group is passed as the key. The rank of the process in
		 * the resulting sub-communicator is returned. The temporary communicator
		 * and info object are freed before the function returns.
		 *
		 * \param group Communicator to be split; defaults to `AllGroup()`.
		 * \return Rank of the process within its shared-memory group, or 0 when
		 *         the code is compiled without `HAVE_MPI`.
		 */
		inline int getRankOnNode( MPI_Comm group = AllGroup() )
		{
		#ifndef HAVE_MPI
		   return 0;
		#else
		   // rank in the parent communicator serves as the key of the split
		   const int key = GetRank( group );

		   MPI_Info split_info;
		   MPI_Info_create( &split_info );

		   MPI_Comm node_comm;
		   MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, key, split_info, &node_comm );

		   const int rank_on_node = GetRank( node_comm );

		   // release the temporary MPI handles
		   MPI_Comm_free( &node_comm );
		   MPI_Info_free( &split_info );

		   return rank_on_node;
		#endif
		}

		} // namespace MPI
		} // namespace TNL

+6 −1

Original line number	Diff line number	Diff line
		@@ -22,11 +22,13 @@

		#include <TNL/Assert.h>
		#include "getDataType.h"
		#include "selectGPU.h"

		namespace TNL {
		namespace MPI {

		// Forward declaration to break the cyclic inclusion: "selectGPU.h" is
		// included at the end of this header (late inclusion).
		inline void selectGPU();

		// function wrappers for MPI constants

		inline MPI_Comm AllGroup()
		@@ -345,3 +347,6 @@ void Alltoall( const T* sendData,

		} // namespace MPI
		} // namespace TNL

		// late inclusion to break cyclic inclusion
		#include "selectGPU.h"

+5 −40

Original line number	Diff line number	Diff line
		@@ -10,63 +10,28 @@

		#pragma once

		#include <cstring>

		#include <TNL/Cuda/CheckDevice.h>

		#include "Utils.h"

		namespace TNL {
		namespace MPI {
		namespace {

		#if defined( HAVE_MPI ) && defined( HAVE_CUDA )
		// Packed record holding a single processor name of at most
		// MPI_MAX_PROCESSOR_NAME characters.
		struct __attribute__((__packed__)) procName
		{
		   char name[ MPI_MAX_PROCESSOR_NAME ];
		};
		#endif

		/**
		 * \brief Selects the CUDA device used by the calling MPI process.
		 *
		 * The device index is the rank of the process on its node, as returned by
		 * getRankOnNode(), modulo the number of devices reported by
		 * `cudaGetDeviceCount`. The chosen index is passed to `cudaSetDevice`.
		 *
		 * The function does nothing unless both `HAVE_MPI` and `HAVE_CUDA` are
		 * defined, or when no CUDA device is reported.
		 */
		inline void selectGPU()
		{
		#ifdef HAVE_MPI
		#ifdef HAVE_CUDA
		   int gpuCount = 0;
		   cudaGetDeviceCount( &gpuCount );

		   // no device reported: skip the selection instead of dividing by zero below
		   if( gpuCount < 1 )
		      return;

		   // the node-local rank replaces the former exchange of processor names
		   const int local_rank = getRankOnNode();
		   const int gpuNumber = local_rank % gpuCount;

		   cudaSetDevice( gpuNumber );
		   TNL_CHECK_CUDA_DEVICE;
		#endif
		#endif
		}

		} // namespace <unnamed>
		} // namespace MPI
		} // namespace TNL