Commit 53758352 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

MPI: added getRankOnNode and removed MPI_Get_processor_name from selectGPU

parent 9a88469e
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries.
    - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for
      computations on Nvidia GPUs.
    - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can
      use a library implementing the MPI-3 standard for distributed computing
      (e.g. [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA
      computations, the library must be [CUDA-aware](
      https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).

- __Libraries:__
+30 −0
Original line number Diff line number Diff line
@@ -42,5 +42,35 @@ inline void restoreRedirection()
   }
}

/**
 * \brief Determines the rank of the calling process among the processes of
 * the given communicator that run on the same shared-memory node.
 *
 * A temporary sub-communicator is obtained from \e group by calling
 * `MPI_Comm_split_type` with the `MPI_COMM_TYPE_SHARED` split type (MPI-3),
 * passing the rank of the process in \e group as the key argument. The rank
 * of the process in that sub-communicator is the result; the temporary
 * communicator and info object are released before returning.
 *
 * \param group Communicator to be split (defaults to `AllGroup()`).
 * \return Rank of the process in the node-local sub-communicator, or 0 when
 *         the code is compiled without `HAVE_MPI`.
 */
inline int getRankOnNode( MPI_Comm group = AllGroup() )
{
#ifndef HAVE_MPI
   return 0;
#else
   // freshly created info object, passed to the split without any keys set
   MPI_Info split_info;
   MPI_Info_create( &split_info );

   // the rank in the parent communicator is used as the key of the split
   MPI_Comm node_comm;
   MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, GetRank( group ), split_info, &node_comm );

   const int rank_on_node = GetRank( node_comm );

   // release the temporaries created above
   MPI_Info_free( &split_info );
   MPI_Comm_free( &node_comm );

   return rank_on_node;
#endif
}

} // namespace MPI
} // namespace TNL
+6 −1
Original line number Diff line number Diff line
@@ -22,11 +22,13 @@

#include <TNL/Assert.h>
#include "getDataType.h"
#include "selectGPU.h"

namespace TNL {
namespace MPI {

// Forward declaration of selectGPU(): "selectGPU.h" is included only at the
// very end of this file (late inclusion) in order to break a cyclic inclusion,
// so the function has to be declared here before its definition is seen.
inline void selectGPU();

// function wrappers for MPI constants

inline MPI_Comm AllGroup()
@@ -345,3 +347,6 @@ void Alltoall( const T* sendData,

} // namespace MPI
} // namespace TNL

// late inclusion to break cyclic inclusion
#include "selectGPU.h"
+5 −40
Original line number Diff line number Diff line
@@ -10,63 +10,28 @@

#pragma once

#include <cstring>

#include <TNL/Cuda/CheckDevice.h>

#include "Utils.h"

namespace TNL {
namespace MPI {

/**
 * \brief Selects the CUDA device to be used by the calling MPI process.
 *
 * The device index is computed as `getRankOnNode() % gpuCount`, where
 * `gpuCount` is the value reported by `cudaGetDeviceCount`, and it is
 * activated with `cudaSetDevice`.
 *
 * The function body is empty unless both `HAVE_MPI` and `HAVE_CUDA` are
 * defined.
 *
 * The function is deliberately *not* placed in an unnamed namespace: it is a
 * header-defined `inline` function and it must be the same entity as the
 * `TNL::MPI::selectGPU()` forward declaration used to break the cyclic
 * inclusion of the MPI headers.
 */
inline void selectGPU()
{
#ifdef HAVE_MPI
#ifdef HAVE_CUDA
   // getRankOnNode() splits the communicator, so it is called first and
   // unconditionally -- every process reaches it before any early exit below.
   const int local_rank = getRankOnNode();

   // Initialized so that a failed query cannot leave an indeterminate value.
   int gpuCount = 0;
   cudaGetDeviceCount( &gpuCount );
   // same check as after cudaSetDevice below
   TNL_CHECK_CUDA_DEVICE;

   // Guard against a modulo by zero when no device was reported.
   if( gpuCount <= 0 )
      return;

   const int gpuNumber = local_rank % gpuCount;

   cudaSetDevice( gpuNumber );
   TNL_CHECK_CUDA_DEVICE;
#endif
#endif
}

} // namespace MPI
} // namespace TNL