Commit 2c51cea2 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'JK/MPI' into 'develop'

Added wrapper for MPI communicators

See merge request !132
parents b55f46c3 bdcc29bc
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -10,3 +10,18 @@
#add_subdirectory( flow )
#add_subdirectory( flow-sw )
#add_subdirectory( flow-vl )

# Host (CPU) tools, each built from a same-named <target>.cpp source.
set( CPP_TARGETS  tnl-optimize-ranks )
# CUDA variants, each built from a same-named <target>.cu source;
# only compiled when BUILD_CUDA is enabled below.
set( CUDA_TARGETS  tnl-optimize-ranks-cuda )

foreach( target IN ITEMS ${CPP_TARGETS} )
   add_executable( ${target} ${target}.cpp )
endforeach()
install( TARGETS ${CPP_TARGETS} RUNTIME DESTINATION bin )

if( BUILD_CUDA )
   foreach( target IN ITEMS ${CUDA_TARGETS} )
      # NOTE(review): cuda_add_executable comes from the deprecated FindCUDA
      # module -- presumably the rest of this build still relies on it; if the
      # project migrates to native CMake CUDA support, switch to add_executable.
      cuda_add_executable( ${target} ${target}.cu )
   endforeach()
   install( TARGETS ${CUDA_TARGETS} RUNTIME DESTINATION bin )
endif()
+1 −0
Original line number Diff line number Diff line
tnl-optimize-ranks.cpp
 No newline at end of file
+54 −0
Original line number Diff line number Diff line
#include <TNL/MPI/ScopedInitializer.h>
#include <TNL/MPI/optimizeRanks.h>

// Select the device backend used for the rank-optimization measurements:
// CUDA when the build defines HAVE_CUDA, otherwise the host (CPU) backend.
#ifdef HAVE_CUDA
   using DeviceType = TNL::Devices::Cuda;
#else
   using DeviceType = TNL::Devices::Host;
#endif

/**
 * Example driver for TNL::MPI::optimizeRanks.
 *
 * Builds a 1D nearest-neighbor communication pattern over all MPI ranks,
 * asks optimizeRanks for a communicator with a (hopefully) cheaper rank
 * placement, and then re-measures the all-to-all communication cost on
 * the permuted communicator so rank 0 can print the resulting cost.
 */
int main( int argc, char* argv[] )
{
   TNL::MPI::ScopedInitializer mpi(argc, argv);

   const int myRank = TNL::MPI::GetRank();
   const int ranks = TNL::MPI::GetSize();

   // TODO: this is only an example
   using Pattern = TNL::Matrices::DenseMatrix< int, TNL::Devices::Sequential, int >;
   Pattern comm_pattern( ranks, ranks );
   comm_pattern.setValue( 0 );
   // Nearest-neighbor pattern without periodic boundary: each rank talks to
   // its predecessor and successor only.
   // (Periodic variant would be:
   //    comm_pattern( i, (i + 1 + ranks) % ranks ) = 1;
   //    comm_pattern( i, (i - 1 + ranks) % ranks ) = 1; )
   for( int j = 1; j < ranks; j++ ) {
      comm_pattern( j - 1, j ) = 1;
      comm_pattern( j, j - 1 ) = 1;
   }

   if( myRank == 0 )
      std::cout << "Communication pattern:\n" << comm_pattern << std::endl;

   const TNL::MPI::Comm perm_comm = TNL::MPI::optimizeRanks< DeviceType >( MPI_COMM_WORLD, comm_pattern );

   std::cout << "rank " << myRank << " remapped to " << perm_comm.rank() << std::endl;

   // measure again to verify (up to measurement errors) the optimization
   const auto cost_matrix_perm = TNL::MPI::measureAlltoallCommunicationCost< DeviceType >( perm_comm );

   if( myRank == 0 ) {
      // Identity permutation: evaluate the measured cost matrix against the
      // same pattern without any further remapping.
      using Vector = TNL::Containers::Vector< double, TNL::Devices::Sequential, int >;
      Vector identity( ranks );
      for( int i = 0; i < ranks; i++ )
         identity[ i ] = i;

      std::cout << "cost matrix after permutation:\n" << cost_matrix_perm << std::endl;
      const auto cost = TNL::MPI::getCommunicationCosts( cost_matrix_perm, comm_pattern, identity );
      std::cout << "cost vector: " << cost << " sum " << TNL::sum( cost ) << std::endl;
   }

   return EXIT_SUCCESS;
}
+3 −1
Original line number Diff line number Diff line
@@ -3,7 +3,9 @@

// conversions have to be registered for each object file
#include "../tnl_conversions.h"
#include "TNL/MPI/Wrappers.h"

#include <TNL/MPI/Wrappers.h>
#include <TNL/MPI/ScopedInitializer.h>

// external functions
void export_DistributedMeshes( py::module & m );
+3 −3
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ struct DistributedScan
      using ValueType = typename OutputDistributedArray::ValueType;
      using DeviceType = typename OutputDistributedArray::DeviceType;

      const auto communicator = input.getCommunicator();
      const auto& communicator = input.getCommunicator();
      if( communicator != MPI_COMM_NULL ) {
         // adjust begin and end for the local range
         const auto localRange = input.getLocalRange();
@@ -49,7 +49,7 @@ struct DistributedScan
         const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );

         // exchange local results between ranks
         const int nproc = MPI::GetSize( communicator );
         const int nproc = communicator.size();
         std::unique_ptr< ValueType[] > dataForScatter{ new ValueType[ nproc ] };
         for( int i = 0; i < nproc; i++ )
            dataForScatter[ i ] = local_result;
@@ -62,7 +62,7 @@ struct DistributedScan
            rank_results, rank_results, 0, nproc, 0, reduction, identity );

         // perform the second phase, using the per-block and per-rank results
         const int rank = MPI::GetRank( communicator );
         const int rank = communicator.rank();
         Scan< DeviceType, Type, PhaseType >::performSecondPhase(
            inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, identity, rank_results[ rank ] );
      }
Loading