Refactoring distributed mesh and grid synchronizers: using DistributedMesh as... (d6bd51b7) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -80,7 +80,7 @@ class tnlDirectEikonalProblem

		protected:

		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< MeshFunctionType >;
		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< Meshes::DistributedMeshes::DistributedMesh< typename MeshFunctionType::MeshType > >;
		DistributedMeshSynchronizerType synchronizer;

		MeshFunctionPointer u;

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -113,7 +113,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator,

		protected:

		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< MeshFunctionType >;
		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< Meshes::DistributedMeshes::DistributedMesh< typename MeshFunctionType::MeshType > >;
		DistributedMeshSynchronizerType synchronizer;

		const IndexType maxIterations;
		@@ -174,7 +174,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator,

		protected:

		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< MeshFunctionType >;
		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< Meshes::DistributedMeshes::DistributedMesh< typename MeshFunctionType::MeshType > >;
		DistributedMeshSynchronizerType synchronizer;

		const IndexType maxIterations;

src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h

+38 −33

Original line number	Diff line number	Diff line
		@@ -12,6 +12,7 @@

		#include <TNL/Meshes/Grid.h>
		#include <TNL/Containers/Array.h>
		#include <TNL/Meshes/DistributedMeshes/DistributedGrid.h>
		#include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
		#include <TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h>
		#include <TNL/Meshes/DistributedMeshes/Directions.h>
		@@ -22,21 +23,18 @@ namespace TNL {
		namespace Meshes {
		namespace DistributedMeshes {

		template< typename MeshFunction,
		int MeshDimension,
		// NOTE: this specialization works only for synchronizations on cells
		template< int MeshDimension,
		typename Index,
		typename Device,
		typename GridReal >
		class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal, Device, Index > >
		class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridReal, Device, Index > >, MeshDimension >
		{
		using RealType = typename MeshFunction::RealType;
		public:
		static constexpr int getMeshDimension() { return MeshDimension; };
		static constexpr int getNeighborCount() {return DirectionCount<MeshDimension>::get();};

		typedef typename Grid< MeshDimension, GridReal, Device, Index >::Cell Cell;
		// FIXME: clang does not like this (incomplete type error)
		// typedef typename Functions::MeshFunction< Grid< 3, GridReal, Device, Index >,EntityDimension, RealType> MeshFunctionType;
		typedef typename Grid< MeshDimension, GridReal, Device, Index >::DistributedMeshType DistributedGridType;
		typedef typename DistributedGridType::CoordinatesType CoordinatesType;
		using SubdomainOverlapsType = typename DistributedGridType::SubdomainOverlapsType;
		@@ -107,8 +105,6 @@ class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal,
		}

		sendSizes[ i ] = sendSize;
		sendBuffers[ i ].setSize( sendSize );
		recieveBuffers[ i ].setSize( sendSize);

		if( this->periodicBoundariesCopyDirection == OverlapToBoundary &&
		neighbors[ i ] == -1 )
		@@ -123,11 +119,20 @@ class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal,
		bool periodicBoundaries = false,
		const PeriodicBoundariesMaskPointer& mask = PeriodicBoundariesMaskPointer( nullptr ) )
		{
		using RealType = typename MeshFunctionType::RealType;

		static_assert( MeshFunctionType::getEntitiesDimension() == MeshFunctionType::getMeshDimension(),
		"this specialization works only for synchronizations on cells" );
		TNL_ASSERT_TRUE( isSet, "Synchronizer is not set, but used to synchronize" );

		if( !distributedGrid->isDistributed() ) return;

		// allocate buffers (setSize does nothing if the array size is already correct)
		for( int i=0; i<this->getNeighborCount(); i++ ) {
		sendBuffers[ i ].setSize( sendSizes[ i ] * sizeof(RealType) );
		recieveBuffers[ i ].setSize( sendSizes[ i ] * sizeof(RealType));
		}

		const int *neighbors = distributedGrid->getNeighbors();
		const int *periodicNeighbors = distributedGrid->getPeriodicNeighbors();

		@@ -154,16 +159,16 @@ class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal,
		if( neighbors[ i ] != -1 )
		{
		//TNL_MPI_PRINT( "Sending data to node " << neighbors[ i ] );
		requests[ requestsCount++ ] = CommunicatorType::ISend( sendBuffers[ i ].getData(), sendSizes[ i ], neighbors[ i ], 0, group );
		requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group );
		//TNL_MPI_PRINT( "Receiving data from node " << neighbors[ i ] );
		requests[ requestsCount++ ] = CommunicatorType::IRecv( recieveBuffers[ i ].getData(), sendSizes[ i ], neighbors[ i ], 0, group );
		requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group );
		}
		else if( periodicBoundaries && sendSizes[ i ] !=0 )
		{
		//TNL_MPI_PRINT( "Sending data to node " << periodicNeighbors[ i ] );
		requests[ requestsCount++ ] = CommunicatorType::ISend( sendBuffers[ i ].getData(), sendSizes[ i ], periodicNeighbors[ i ], 1, group );
		requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group );
		//TNL_MPI_PRINT( "Receiving data to node " << periodicNeighbors[ i ] );
		requests[ requestsCount++ ] = CommunicatorType::IRecv( recieveBuffers[ i ].getData(), sendSizes[ i ], periodicNeighbors[ i ], 1, group );
		requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group );
		}
		}

		@@ -182,12 +187,11 @@ class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal,
		}

		private:
		template< typename Real_,
		typename MeshFunctionType,
		template< typename MeshFunctionType,
		typename PeriodicBoundariesMaskPointer >
		void copyBuffers(
		MeshFunctionType& meshFunction,
		Containers::Array<Real_, Device, Index>* buffers,
		Containers::Array<std::uint8_t, Device, Index>* buffers,
		CoordinatesType* begins,
		CoordinatesType* sizes,
		bool toBuffer,
		@@ -195,23 +199,24 @@ class DistributedMeshSynchronizer< MeshFunction, Grid< MeshDimension, GridReal,
		bool periodicBoundaries,
		const PeriodicBoundariesMaskPointer& mask )
		{
		using Helper = BufferEntitiesHelper< MeshFunctionType, PeriodicBoundariesMaskPointer, getMeshDimension(), Real_, Device >;
		using RealType = typename MeshFunctionType::RealType;
		using Helper = BufferEntitiesHelper< MeshFunctionType, PeriodicBoundariesMaskPointer, getMeshDimension(), RealType, Device >;

		for(int i=0;i<this->getNeighborCount();i++)
		{
		bool isBoundary=( neighbor[ i ] == -1 );
		if( ! isBoundary \|\| periodicBoundaries )
		{
		Helper::BufferEntities( meshFunction, mask, buffers[ i ].getData(), isBoundary, begins[i], sizes[i], toBuffer );
		Helper::BufferEntities( meshFunction, mask, reinterpret_cast<RealType*>( buffers[ i ].getData() ), isBoundary, begins[i], sizes[i], toBuffer );
		}
		}
		}

		private:

		Containers::Array<RealType, Device, Index> sendBuffers[getNeighborCount()];
		Containers::Array<RealType, Device, Index> recieveBuffers[getNeighborCount()];
		Containers::StaticArray< getNeighborCount(), int > sendSizes;
		Containers::Array< std::uint8_t, Device, Index > sendBuffers[getNeighborCount()];
		Containers::Array< std::uint8_t, Device, Index > recieveBuffers[getNeighborCount()];

		PeriodicBoundariesCopyDirection periodicBoundariesCopyDirection = BoundaryToOverlap;

src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h

+61 −80

Original line number	Diff line number	Diff line
		@@ -12,39 +12,26 @@

		#pragma once

		#include <TNL/Containers/Vector.h>
		#include <TNL/Matrices/DenseMatrix.h>
		#include <TNL/Communicators/MpiCommunicator.h>
		#include <TNL/Exceptions/NotImplementedError.h>

		namespace TNL {
		namespace Meshes {
		namespace DistributedMeshes {

		template< typename MeshFunction,
		// Mesh is used only for DistributedGrid specializations
		typename Mesh = typename MeshFunction::MeshType >
		template< typename DistributedMesh,
		int EntityDimension = DistributedMesh::getMeshDimension() >
		class DistributedMeshSynchronizer
		{
		public:
		// TODO: generalize
		static constexpr int EntityDimension = MeshFunction::getEntitiesDimension();
		static_assert( EntityDimension == 0 \|\| EntityDimension == MeshFunction::getMeshDimension(),
		"Synchronization for entities of the specified dimension is not implemented yet." );

		using RealType = typename MeshFunction::RealType;
		using DeviceType = typename MeshFunction::DeviceType;
		using IndexType = typename MeshFunction::IndexType;
		using CommunicatorType = Communicators::MpiCommunicator;
		using DeviceType = typename DistributedMesh::DeviceType;
		using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
		using CommunicatorType = typename DistributedMesh::CommunicatorType;

		DistributedMeshSynchronizer() = default;

		template< typename DistributedMesh >
		void initialize( const DistributedMesh& mesh )
		{
		static_assert( std::is_same< typename DistributedMesh::MeshType, typename MeshFunction::MeshType >::value,
		"The type of the local mesh does not match the type of the mesh function's mesh." );
		static_assert( std::is_same< typename DistributedMesh::CommunicatorType, CommunicatorType >::value,
		"DistributedMesh::CommunnicatorType does not match" );
		if( mesh.getGhostLevels() <= 0 )
		throw std::logic_error( "There are no ghost levels on the distributed mesh." );
		localEntitiesCount = mesh.getLocalMesh().template getEntitiesCount< EntityDimension >();
		@@ -61,19 +48,18 @@ public:

		// exchange the global index offsets so that each rank can determine the
		// owner of every entity by its global index
		const IndexType ownStart = mesh.template getGlobalIndices< EntityDimension >().getElement( 0 );
		Containers::Array< IndexType, Devices::Host, int > offsets( nproc );
		const GlobalIndexType ownStart = mesh.template getGlobalIndices< EntityDimension >().getElement( 0 );
		Containers::Array< GlobalIndexType, Devices::Host, int > offsets( nproc );
		{
		Containers::Array< IndexType, Devices::Host, int > sendbuf( nproc );
		Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc );
		sendbuf.setValue( ownStart );
		CommunicatorType::Alltoall( sendbuf.getData(), 1,
		offsets.getData(), 1,
		group );
		}

		auto getOwner = [&] ( IndexType idx )
		auto getOwner = [&] ( GlobalIndexType idx )
		{
		// TODO: is there a more efficient way?
		for( int i = 0; i < nproc - 1; i++ )
		if( offsets[ i ] <= idx && idx < offsets[ i + 1 ] )
		return i;
		@@ -81,19 +67,19 @@ public:
		};

		// count local ghost entities for each rank
		Containers::Array< IndexType, Devices::Host, int > localGhostCounts( nproc );
		Containers::Array< GlobalIndexType, Devices::Host, int > localGhostCounts( nproc );
		{
		localGhostCounts.setValue( 0 );
		Containers::Array< IndexType, Devices::Host, IndexType > hostGlobalIndices;
		Containers::Array< GlobalIndexType, Devices::Host, GlobalIndexType > hostGlobalIndices;
		hostGlobalIndices = mesh.template getGlobalIndices< EntityDimension >();
		IndexType prev_global_idx = 0;
		for( IndexType local_idx = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >();
		local_idx < localEntitiesCount;
		local_idx++ )
		GlobalIndexType prev_global_idx = 0;
		mesh.getLocalMesh().template forGhost< EntityDimension, Devices::Sequential >(
		[&] ( GlobalIndexType local_idx )
		{
		if( ! std::is_same< DeviceType, Devices::Cuda >::value )
		if( ! mesh.getLocalMesh().template isGhostEntity< EntityDimension >( local_idx ) )
		throw std::runtime_error( "encountered local entity while iterating over ghost entities - the mesh is probably inconsistent or there is a bug in the DistributedMeshSynchronizer" );
		const IndexType global_idx = hostGlobalIndices[ local_idx ];
		const GlobalIndexType global_idx = hostGlobalIndices[ local_idx ];
		if( global_idx < prev_global_idx )
		throw std::runtime_error( "ghost indices are not sorted - the mesh is probably inconsistent or there is a bug in the DistributedMeshSynchronizer" );
		prev_global_idx = global_idx;
		@@ -102,11 +88,13 @@ public:
		throw std::runtime_error( "the owner of a ghost entity cannot be the local rank - the mesh is probably inconsistent or there is a bug in the DistributedMeshSynchronizer" );
		++localGhostCounts[ owner ];
		}
		);
		}

		// exchange the ghost counts
		ghostEntitiesCounts.setDimensions( nproc, nproc );
		{
		Matrices::DenseMatrix< IndexType, Devices::Host, int > sendbuf;
		Matrices::DenseMatrix< GlobalIndexType, Devices::Host, int > sendbuf;
		sendbuf.setDimensions( nproc, nproc );
		// copy the local ghost counts into all rows of the sendbuf
		for( int j = 0; j < nproc; j++ )
		@@ -126,17 +114,15 @@ public:
		for( int i = 0; i < nproc; i++ )
		ghostNeighborOffsets[ i + 1 ] = ghostNeighborOffsets[ i ] + ghostEntitiesCounts( i, rank );

		// allocate ghost neighbors and send buffers
		// allocate ghost neighbors
		ghostNeighbors.setSize( ghostNeighborOffsets[ nproc ] );
		sendBuffers.setSize( ghostNeighborOffsets[ nproc ] );

		// send indices of ghost entities - set them as ghost neighbors on the
		// target rank
		// send indices of ghost entities - set them as ghost neighbors on the target rank
		{
		std::vector< typename CommunicatorType::Request > requests;

		// send our ghost indices to the neighboring ranks
		IndexType firstGhost = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >();
		GlobalIndexType firstGhost = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >();
		for( int i = 0; i < nproc; i++ ) {
		if( ghostEntitiesCounts( rank, i ) > 0 ) {
		requests.push_back( CommunicatorType::ISend(
		@@ -167,29 +153,18 @@ public:
		// convert received ghost indices from global to local
		ghostNeighbors -= ownStart;
		}

		#if 0
		CommunicatorType::Barrier();
		for( int i = 0; i < nproc; i++ ) {
		if( i == rank ) {
		std::cout << "rank = " << rank << "\n";
		// std::cout << "offsets = " << offsets << std::endl;
		std::cout << "local ghost counts = " << localGhostCounts << "\n";
		std::cout << "ghost entities counts matrix:\n" << ghostEntitiesCounts;
		std::cout << "global indices = " << mesh.template getGlobalIndices< EntityDimension >() << "\n";
		std::cout << "ghost offsets = " << ghostOffsets << "\n";
		std::cout << "ghost neighbors = " << ghostNeighbors << "\n";
		std::cout.flush();
		}
		CommunicatorType::Barrier();
		}
		#endif
		}

		template< typename MeshFunction >
		void synchronize( MeshFunction& function )
		{
		static_assert( MeshFunction::getEntitiesDimension() == EntityDimension,
		"the mesh function's entity dimension does not match" );
		static_assert( std::is_same< typename MeshFunction::MeshType, typename DistributedMesh::MeshType >::value,
		"The type of the mesh function's mesh does not match the local mesh." );
		TNL_ASSERT_EQ( function.getData().getSize(), localEntitiesCount,
		"The mesh function does not have the expected size." );
		using RealType = typename MeshFunction::RealType;

		// GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
		#ifdef HAVE_CUDA
		@@ -200,6 +175,9 @@ public:
		const int rank = CommunicatorType::GetRank( group );
		const int nproc = CommunicatorType::GetSize( group );

		// allocate send buffers (setSize does nothing if the array size is already correct)
		sendBuffers.setSize( ghostNeighborOffsets[ nproc ] * sizeof(RealType) );

		// buffer for asynchronous communication requests
		std::vector< typename CommunicatorType::Request > requests;

		@@ -213,23 +191,24 @@ public:
		}
		}

		auto sendBuffersView = sendBuffers.getView();
		Containers::ArrayView< RealType, DeviceType, GlobalIndexType > sendBuffersView;
		sendBuffersView.bind( reinterpret_cast<RealType*>( sendBuffers.getData() ), ghostNeighborOffsets[ nproc ] );
		const auto ghostNeighborsView = ghostNeighbors.getConstView();
		const auto functionDataView = function.getData().getConstView();
		auto copy_kernel = [sendBuffersView, functionDataView, ghostNeighborsView] __cuda_callable__ ( IndexType k, IndexType offset ) mutable
		auto copy_kernel = [sendBuffersView, functionDataView, ghostNeighborsView] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable
		{
		sendBuffersView[ offset + k ] = functionDataView[ ghostNeighborsView[ offset + k ] ];
		};

		for( int i = 0; i < nproc; i++ ) {
		if( ghostEntitiesCounts( i, rank ) > 0 ) {
		const IndexType offset = ghostNeighborOffsets[ i ];
		const GlobalIndexType offset = ghostNeighborOffsets[ i ];
		// copy data to send buffers
		Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset );
		Algorithms::ParallelFor< DeviceType >::exec( (GlobalIndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset );

		// issue async send operation
		requests.push_back( CommunicatorType::ISend(
		sendBuffers.getData() + ghostNeighborOffsets[ i ],
		sendBuffersView.getData() + ghostNeighborOffsets[ i ],
		ghostEntitiesCounts( i, rank ),
		i, 0, group ) );
		}
		@@ -242,7 +221,7 @@ public:
		protected:
		// count of local entities (including ghosts) - used only for asserts in the
		// synchronize method
		IndexType localEntitiesCount = 0;
		GlobalIndexType localEntitiesCount = 0;

		// GOTCHA (see above)
		int gpu_id = 0;
		@@ -260,14 +239,14 @@ protected:
		* - for the i-th rank, the i-th row determines the receive buffer sizes and
		* the i-th column determines the send buffer sizes
		*/
		Matrices::DenseMatrix< IndexType, Devices::Host, int > ghostEntitiesCounts;
		Matrices::DenseMatrix< GlobalIndexType, Devices::Host, int > ghostEntitiesCounts;

		/**
		* Ghost offsets: the i-th value is the local index of the first ghost
		* entity owned by the i-th rank. All ghost entities owned by the i-th
		* rank are assumed to be indexed contiguously in the local mesh.
		*/
		Containers::Array< IndexType, Devices::Host, int > ghostOffsets;
		Containers::Array< GlobalIndexType, Devices::Host, int > ghostOffsets;

		/**
		* Ghost neighbor offsets: array of size nproc + 1 where the i-th value is
		@@ -275,7 +254,7 @@ protected:
		* value is the size of the ghostNeighbors and sendBuffers arrays (see
		* below).
		*/
		Containers::Array< IndexType, Devices::Host, int > ghostNeighborOffsets;
		Containers::Array< GlobalIndexType, Devices::Host, int > ghostNeighborOffsets;

		/**
		* Ghost neighbor indices: array containing local indices of the entities
		@@ -286,15 +265,17 @@ protected:
		* ghost neighbor indices cannot be made contiguous in general so we need the
		* send buffers.
		*/
		Containers::Vector< IndexType, DeviceType, IndexType > ghostNeighbors;
		Containers::Vector< GlobalIndexType, DeviceType, GlobalIndexType > ghostNeighbors;

		/**
		* Send buffers: array for buffering the mesh function values which will be
		* sent to other ranks. The send buffer for the i-th rank is the part of the
		* array starting at index ghostNeighborOffsets[i] (inclusive) and ending at
		* index ghostNeighborOffsets[i+1] (exclusive).
		* sent to other ranks. We use std::uint8_t as the value type to make this
		* class independent of the mesh function's real type. When cast to the real
		* type values, the send buffer for the i-th rank is the part of the array
		* starting at index ghostNeighborOffsets[i] (inclusive) and ending at index
		* ghostNeighborOffsets[i+1] (exclusive).
		*/
		Containers::Array< RealType, DeviceType, IndexType > sendBuffers;
		Containers::Array< std::uint8_t, DeviceType, GlobalIndexType > sendBuffers;
		};

		} // namespace DistributedMeshes

src/TNL/Problems/HeatEquationProblem.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -104,7 +104,7 @@ class HeatEquationProblem : public PDEProblem< Mesh,

		protected:

		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< MeshFunctionType >;
		using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer< Meshes::DistributedMeshes::DistributedMesh< typename MeshFunctionType::MeshType > >;
		DistributedMeshSynchronizerType synchronizer;

		MeshFunctionPointer uPointer;