Commit 2c51cea2 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'JK/MPI' into 'develop'

Added wrapper for MPI communicators

See merge request !132
parents b55f46c3 bdcc29bc
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -10,3 +10,18 @@
#add_subdirectory( flow )
#add_subdirectory( flow-sw )
#add_subdirectory( flow-vl )

# Host (CPU) tools, each built from a same-named <target>.cpp source.
set( CPP_TARGETS  tnl-optimize-ranks )
# CUDA variants, each built from a same-named <target>.cu source;
# only compiled when BUILD_CUDA is enabled below.
set( CUDA_TARGETS  tnl-optimize-ranks-cuda )

foreach( target IN ITEMS ${CPP_TARGETS} )
   add_executable( ${target} ${target}.cpp )
endforeach()
install( TARGETS ${CPP_TARGETS} RUNTIME DESTINATION bin )

if( BUILD_CUDA )
   foreach( target IN ITEMS ${CUDA_TARGETS} )
      # NOTE(review): cuda_add_executable comes from the deprecated FindCUDA
      # module -- presumably the rest of this build still relies on it; if the
      # project migrates to native CMake CUDA support, switch to add_executable.
      cuda_add_executable( ${target} ${target}.cu )
   endforeach()
   install( TARGETS ${CUDA_TARGETS} RUNTIME DESTINATION bin )
endif()
+1 −0
Original line number Diff line number Diff line
tnl-optimize-ranks.cpp
 No newline at end of file
+54 −0
Original line number Diff line number Diff line
#include <TNL/MPI/ScopedInitializer.h>
#include <TNL/MPI/optimizeRanks.h>

// Select the device backend used for the rank-optimization measurements:
// CUDA when the build defines HAVE_CUDA, otherwise the host (CPU) backend.
#ifdef HAVE_CUDA
   using DeviceType = TNL::Devices::Cuda;
#else
   using DeviceType = TNL::Devices::Host;
#endif

/**
 * Example driver for TNL::MPI::optimizeRanks.
 *
 * Builds a 1D nearest-neighbor communication pattern over all MPI ranks,
 * asks optimizeRanks for a communicator with a (hopefully) cheaper rank
 * placement, and then re-measures the all-to-all communication cost on
 * the permuted communicator so rank 0 can print the resulting cost.
 */
int main( int argc, char* argv[] )
{
   TNL::MPI::ScopedInitializer mpi(argc, argv);

   const int myRank = TNL::MPI::GetRank();
   const int ranks = TNL::MPI::GetSize();

   // TODO: this is only an example
   using Pattern = TNL::Matrices::DenseMatrix< int, TNL::Devices::Sequential, int >;
   Pattern comm_pattern( ranks, ranks );
   comm_pattern.setValue( 0 );
   // Nearest-neighbor pattern without periodic boundary: each rank talks to
   // its predecessor and successor only.
   // (Periodic variant would be:
   //    comm_pattern( i, (i + 1 + ranks) % ranks ) = 1;
   //    comm_pattern( i, (i - 1 + ranks) % ranks ) = 1; )
   for( int j = 1; j < ranks; j++ ) {
      comm_pattern( j - 1, j ) = 1;
      comm_pattern( j, j - 1 ) = 1;
   }

   if( myRank == 0 )
      std::cout << "Communication pattern:\n" << comm_pattern << std::endl;

   const TNL::MPI::Comm perm_comm = TNL::MPI::optimizeRanks< DeviceType >( MPI_COMM_WORLD, comm_pattern );

   std::cout << "rank " << myRank << " remapped to " << perm_comm.rank() << std::endl;

   // measure again to verify (up to measurement errors) the optimization
   const auto cost_matrix_perm = TNL::MPI::measureAlltoallCommunicationCost< DeviceType >( perm_comm );

   if( myRank == 0 ) {
      // Identity permutation: evaluate the measured cost matrix against the
      // same pattern without any further remapping.
      using Vector = TNL::Containers::Vector< double, TNL::Devices::Sequential, int >;
      Vector identity( ranks );
      for( int i = 0; i < ranks; i++ )
         identity[ i ] = i;

      std::cout << "cost matrix after permutation:\n" << cost_matrix_perm << std::endl;
      const auto cost = TNL::MPI::getCommunicationCosts( cost_matrix_perm, comm_pattern, identity );
      std::cout << "cost vector: " << cost << " sum " << TNL::sum( cost ) << std::endl;
   }

   return EXIT_SUCCESS;
}
+3 −1
Original line number Diff line number Diff line
@@ -3,7 +3,9 @@

// conversions have to be registered for each object file
#include "../tnl_conversions.h"
#include "TNL/MPI/Wrappers.h"

#include <TNL/MPI/Wrappers.h>
#include <TNL/MPI/ScopedInitializer.h>

// external functions
void export_DistributedMeshes( py::module & m );
+3 −3
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ struct DistributedScan
      using ValueType = typename OutputDistributedArray::ValueType;
      using DeviceType = typename OutputDistributedArray::DeviceType;

      const auto communicator = input.getCommunicator();
      const auto& communicator = input.getCommunicator();
      if( communicator != MPI_COMM_NULL ) {
         // adjust begin and end for the local range
         const auto localRange = input.getLocalRange();
@@ -49,7 +49,7 @@ struct DistributedScan
         const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );

         // exchange local results between ranks
         const int nproc = MPI::GetSize( communicator );
         const int nproc = communicator.size();
         std::unique_ptr< ValueType[] > dataForScatter{ new ValueType[ nproc ] };
         for( int i = 0; i < nproc; i++ )
            dataForScatter[ i ] = local_result;
@@ -62,7 +62,7 @@ struct DistributedScan
            rank_results, rank_results, 0, nproc, 0, reduction, identity );

         // perform the second phase, using the per-block and per-rank results
         const int rank = MPI::GetRank( communicator );
         const int rank = communicator.rank();
         Scan< DeviceType, Type, PhaseType >::performSecondPhase(
            inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, identity, rank_results[ rank ] );
      }
Loading