From dfb0351720a8464031a327292ab4e0ce8b9b7e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 1 Nov 2020 12:48:34 +0100 Subject: [PATCH 01/50] Merge the functionality of NoDistrCommunicator into MpiCommunicator When the program is compiled without MPI support, or when run without mpirun, the MpiCommunicator behaves just like NoDistrCommunicator. Hence, we will not need to separate between the two classes. --- src/TNL/Communicators/MpiCommunicator.h | 36 +++++++++---------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 1382fb7a6..dd119e813 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -165,8 +165,6 @@ class MpiCommunicator Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); } } -#else - throw Exceptions::MPISupportMissing(); #endif } @@ -193,7 +191,7 @@ class MpiCommunicator MPI_Finalized(&finalized); return initialized && !finalized; #else - throw Exceptions::MPISupportMissing(); + return true; #endif } @@ -206,7 +204,7 @@ class MpiCommunicator MPI_Comm_rank(group,&rank); return rank; #else - throw Exceptions::MPISupportMissing(); + return 0; #endif } @@ -219,7 +217,7 @@ class MpiCommunicator MPI_Comm_size(group,&size); return size; #else - throw Exceptions::MPISupportMissing(); + return 1; #endif } @@ -252,7 +250,8 @@ class MpiCommunicator MPI_Dims_create(nproc, dim, distr); #else - throw Exceptions::MPISupportMissing(); + for(int i=0;i( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group ); -#else - throw Exceptions::MPISupportMissing(); #endif } @@ -287,8 +282,6 @@ class MpiCommunicator TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup"); MPI_Status status; MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status ); -#else - throw 
Exceptions::MPISupportMissing(); #endif } @@ -302,7 +295,7 @@ class MpiCommunicator MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req); return req; #else - throw Exceptions::MPISupportMissing(); + return 1; #endif } @@ -316,7 +309,7 @@ class MpiCommunicator MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req); return req; #else - throw Exceptions::MPISupportMissing(); + return 1; #endif } @@ -325,8 +318,6 @@ class MpiCommunicator #ifdef HAVE_MPI TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE); -#else - throw Exceptions::MPISupportMissing(); #endif } @@ -337,8 +328,6 @@ class MpiCommunicator TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup"); MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group); -#else - throw Exceptions::MPISupportMissing(); #endif } @@ -353,7 +342,7 @@ class MpiCommunicator TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group); #else - throw Exceptions::MPISupportMissing(); + memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) ); #endif } @@ -367,8 +356,6 @@ class MpiCommunicator #ifdef HAVE_MPI TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group); -#else - throw Exceptions::MPISupportMissing(); #endif } @@ -385,7 +372,7 @@ class MpiCommunicator TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup"); MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T 
>::getType(),op,root,group); #else - throw Exceptions::MPISupportMissing(); + memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) ); #endif } @@ -437,7 +424,8 @@ class MpiCommunicator MPITypeResolver< T >::getType(), group ); #else - throw Exceptions::MPISupportMissing(); + TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." ); + memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) ); #endif } @@ -458,7 +446,7 @@ class MpiCommunicator else MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup); #else - throw Exceptions::MPISupportMissing(); + newGroup=oldGroup; #endif } -- GitLab From 3bddf4139bfa72f4bba3f4e865638757ec32fe0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 1 Nov 2020 12:58:55 +0100 Subject: [PATCH 02/50] Removed useless uses of NoDistrCommunicator --- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 5 -- .../tnl-benchmark-linear-solvers.h | 5 -- src/Benchmarks/ODESolvers/Euler.hpp | 2 - src/Benchmarks/ODESolvers/Merson.hpp | 2 - src/Benchmarks/ODESolvers/SimpleProblem.h | 3 +- .../ODESolvers/tnl-benchmark-ode-solvers.h | 5 -- src/TNL/Solvers/Linear/Traits.h | 4 +- src/TNL/Solvers/ODE/Merson_impl.h | 1 - src/TNL/Solvers/SolverInitiator.h | 2 +- src/TNL/Solvers/SolverInitiator_impl.h | 39 ++------- src/TNL/Solvers/SolverStarter_impl.h | 2 - src/Tools/tnl-init.h | 87 ++++++++----------- 12 files changed, 48 insertions(+), 109 deletions(-) diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index 683e6960a..b791b0100 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -39,11 +38,7 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index, using namespace 
TNL; using namespace TNL::Benchmarks; -#ifdef HAVE_MPI using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif template< typename Matrix, typename Vector > diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index e5a8d9819..cadb5a046 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -66,11 +65,7 @@ using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Pointers; -#ifdef HAVE_MPI using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif static const std::set< std::string > valid_solvers = { diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp index ab975ed07..5039417b7 100644 --- a/src/Benchmarks/ODESolvers/Euler.hpp +++ b/src/Benchmarks/ODESolvers/Euler.hpp @@ -10,8 +10,6 @@ #pragma once -#include -#include #include "ComputeBlockResidue.h" namespace TNL { diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp index c97bfc236..1fd8f8a2b 100644 --- a/src/Benchmarks/ODESolvers/Merson.hpp +++ b/src/Benchmarks/ODESolvers/Merson.hpp @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include "Merson.h" diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h index ff81fd18e..122606a32 100644 --- a/src/Benchmarks/ODESolvers/SimpleProblem.h +++ b/src/Benchmarks/ODESolvers/SimpleProblem.h @@ -14,6 +14,7 @@ #include #include +#include namespace TNL { namespace Benchmarks { @@ -27,7 +28,7 @@ struct SimpleProblem using DeviceType = Device; using IndexType = Index; using DofVectorType = Containers::Vector< RealType, DeviceType, 
IndexType >; - using CommunicatorType = Communicators::NoDistrCommunicator; + using CommunicatorType = Communicators::MpiCommunicator; template< typename VectorPointer > void getExplicitUpdate( const RealType& time, diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h index bbde88945..aa4370c7a 100644 --- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h +++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -39,11 +38,7 @@ using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Pointers; -#ifdef HAVE_MPI using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif template< typename Real, typename Index > diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h index 9a5db2c40..5f93e0cde 100644 --- a/src/TNL/Solvers/Linear/Traits.h +++ b/src/TNL/Solvers/Linear/Traits.h @@ -12,7 +12,7 @@ #pragma once -#include +#include #include #include #include @@ -26,7 +26,7 @@ namespace Linear { template< typename Matrix > struct Traits { - using CommunicatorType = Communicators::NoDistrCommunicator; + using CommunicatorType = Communicators::MpiCommunicator; using VectorType = Containers::Vector < typename Matrix::RealType, diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 4c7b21bc9..82a6a87ff 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -14,7 +14,6 @@ #include #include #include -#include #include "Merson.h" diff --git a/src/TNL/Solvers/SolverInitiator.h b/src/TNL/Solvers/SolverInitiator.h index 0ba4dc55a..062857520 100644 --- a/src/TNL/Solvers/SolverInitiator.h +++ b/src/TNL/Solvers/SolverInitiator.h @@ -16,7 +16,7 @@ namespace TNL { namespace Solvers { -template< template< typename Real, typename Device, typename 
Index, typename MeshType, typename ConfigTag, typename SolverStarter , typename CommunicatorType > class ProblemSetter, +template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter, typename ConfigTag > class SolverInitiator { diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h index 16e0fd222..3d704426d 100644 --- a/src/TNL/Solvers/SolverInitiator_impl.h +++ b/src/TNL/Solvers/SolverInitiator_impl.h @@ -18,7 +18,6 @@ #include #include -#include #include namespace TNL { @@ -50,15 +49,6 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Device, typename Index, typename ConfigTag, - bool enabled = true > -class CommunicatorTypeResolver {}; - -template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter, - typename Real, - typename Device, - typename Index, - typename ConfigTag, - typename CommunicatorType, bool enabled = ConfigTagMeshResolve< ConfigTag >::enabled > class SolverInitiatorMeshResolver {}; @@ -169,7 +159,7 @@ class SolverInitiatorIndexResolver< ProblemSetter, Real, Device, Index, ConfigTa public: static bool run( const Config::ParameterContainer& parameters ) { - return CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::run( parameters ); + return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag >::run( parameters ); } }; @@ -178,28 +168,12 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Device, typename Index, typename ConfigTag > -class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true > -{ - public: - static bool run( const Config::ParameterContainer& parameters ) - { - if( 
Communicators::MpiCommunicator::isDistributed() ) - return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters ); - return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters ); - } -}; - -template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter, - typename Real, - typename Device, - typename Index, - typename ConfigTag, - typename CommunicatorType > -class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, CommunicatorType, false > +class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, false > { public: static bool run( const Config::ParameterContainer& parameters ) { + using CommunicatorType = Communicators::MpiCommunicator; return ProblemSetter< Real, Device, Index, @@ -213,10 +187,11 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Real, typename Device, typename Index, - typename ConfigTag, - typename CommunicatorType > -class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag,CommunicatorType, true > + typename ConfigTag > +class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, true > { + using CommunicatorType = Communicators::MpiCommunicator; + // wrapper for MeshTypeResolver template< typename MeshType > using ProblemSetterWrapper = ProblemSetter< Real, Device, Index, MeshType, ConfigTag, SolverStarter< ConfigTag >, CommunicatorType >; diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h index d2bbd8159..fa1d23951 100644 --- a/src/TNL/Solvers/SolverStarter_impl.h +++ b/src/TNL/Solvers/SolverStarter_impl.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -66,7 +65,6 @@ 
bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame */ if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! Communicators::NoDistrCommunicator::setup( parameters ) || ! Communicators::MpiCommunicator::setup( parameters ) ) return false; diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h index a0d171f14..8a4024ac6 100644 --- a/src/Tools/tnl-init.h +++ b/src/Tools/tnl-init.h @@ -21,19 +21,18 @@ #include #include -#include #include using namespace TNL; template< typename MeshType, typename RealType, - typename CommunicatorType, int xDiff, int yDiff, int zDiff > bool renderFunction( const Config::ParameterContainer& parameters ) { + using CommunicatorType = Communicators::MpiCommunicator; using namespace Meshes::DistributedMeshes; using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh; @@ -130,20 +129,6 @@ bool renderFunction( const Config::ParameterContainer& parameters ) return true; } -template< typename MeshType, - typename RealType, - int xDiff, - int yDiff, - int zDiff > -bool resolveCommunicator( const Config::ParameterContainer& parameters ) -{ -#ifdef HAVE_MPI - if( Communicators::MpiCommunicator::isDistributed() ) - return renderFunction(parameters); -#endif - return renderFunction(parameters); -} - template< typename MeshType, typename RealType > bool resolveDerivatives( const Config::ParameterContainer& parameters ) @@ -160,75 +145,75 @@ bool resolveDerivatives( const Config::ParameterContainer& parameters ) return false; } if( xDiff == 0 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 0 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 1 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< 
MeshType, RealType, 0, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 2 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 3 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 3 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 3 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 4 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 4 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 4 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 0 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 1 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 2 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 3 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 3 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 3 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 0 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 1 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 2 >( parameters ); if( xDiff == 0 && yDiff == 3 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 3, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 3, 0 
>( parameters ); if( xDiff == 0 && yDiff == 3 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 3, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 3, 1 >( parameters ); if( xDiff == 0 && yDiff == 4 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 4, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 4, 0 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 0 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 1 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 2 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 3 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 3 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 3 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 0 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 1 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 2 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 2 >( parameters ); if( xDiff == 1 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 2, 0 >( parameters ); if( xDiff == 1 && yDiff == 2 && zDiff == 1 ) - return resolveCommunicator< 
MeshType, RealType, 1, 2, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 2, 1 >( parameters ); if( xDiff == 1 && yDiff == 3 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 3, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 3, 0 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 0 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 1 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 2 >( parameters ); if( xDiff == 2 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 1, 0 >( parameters ); if( xDiff == 2 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 2, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 2, 1, 1 >( parameters ); if( xDiff == 2 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 2, 0 >( parameters ); if( xDiff == 3 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 3, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 3, 0, 0 >( parameters ); if( xDiff == 3 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 3, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 3, 0, 1 >( parameters ); if( xDiff == 3 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 3, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 3, 1, 0 
>( parameters ); if( xDiff == 4 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 4, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 4, 0, 0 >( parameters ); return false; } -- GitLab From fc2a84a72460b5f39e026682352060b07b17ed36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 1 Nov 2020 13:40:00 +0100 Subject: [PATCH 03/50] Replaced NoDistrCommunicator in unit tests with MpiCommunicator and separate run without mpirun --- src/UnitTests/Containers/CMakeLists.txt | 9 ++++++++ .../Containers/DistributedArrayTest.h | 7 ++---- .../Containers/DistributedVectorTest.h | 7 ++---- .../Containers/VectorBinaryOperationsTest.h | 22 ++----------------- .../Containers/VectorUnaryOperationsTest.h | 11 ++-------- .../Containers/VectorVerticalOperationsTest.h | 11 ++-------- .../Containers/ndarray/CMakeLists.txt | 4 ++++ .../DistributedNDArrayOverlaps_1D_test.h | 15 ------------- .../DistributedNDArrayOverlaps_semi1D_test.h | 1 - .../ndarray/DistributedNDArray_1D_test.h | 15 ++----------- .../ndarray/DistributedNDArray_semi1D_test.h | 3 +-- src/UnitTests/Matrices/CMakeLists.txt | 1 + .../Matrices/DistributedMatrixTest.h | 7 ++---- .../DistributedMeshes/CutMeshFunctionTest.cpp | 8 +++---- 14 files changed, 33 insertions(+), 88 deletions(-) diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt index fdde0a8b7..efba5e50d 100644 --- a/src/UnitTests/Containers/CMakeLists.txt +++ b/src/UnitTests/Containers/CMakeLists.txt @@ -92,30 +92,39 @@ if( ${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorUnaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorUnaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorVerticalOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) if( BUILD_CUDA ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) endif() endif() diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index 204bc6fe7..097a60d26 100644 --- a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -59,12 +58,10 @@ protected: // types for which DistributedArrayTest is instantiated using DistributedArrayTypes = ::testing::Types< - DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedArray< double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator > #ifdef HAVE_CUDA , - 
DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedArray< double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator > #endif >; diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index 2a1834f31..1d727aef6 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -69,12 +68,10 @@ protected: // types for which DistributedVectorTest is instantiated using DistributedVectorTypes = ::testing::Types< - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator > #ifdef HAVE_CUDA , - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator > #endif >; diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index d09798453..7f81d87f5 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -14,7 +14,6 @@ #if defined(DISTRIBUTED_VECTOR) #include - #include #include #include #include @@ -154,16 +153,7 @@ protected: Pair< DistributedVectorView< int, Devices::Host, int, Communicators::MpiCommunicator >, DistributedVector< short, Devices::Host, int, Communicators::MpiCommunicator > >, Pair< DistributedVectorView< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Host, int, 
Communicators::MpiCommunicator > >, - - Pair< DistributedVector< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVector< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > > + DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > > #else Pair< DistributedVector< int, Devices::Cuda, int, Communicators::MpiCommunicator >, DistributedVector< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, @@ -172,15 +162,7 @@ protected: Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, DistributedVector< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, 
Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > > + DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a5beb58d9..867adb069 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -14,7 +14,6 @@ #if defined(DISTRIBUTED_VECTOR) #include - #include #include #include #include @@ -70,17 +69,11 @@ protected: #ifndef HAVE_CUDA DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator > #else DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator > #endif >; #elif defined(STATIC_VECTOR) diff --git 
a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 3aa60e612..ac7fa79d6 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -14,7 +14,6 @@ #if defined(DISTRIBUTED_VECTOR) #include - #include #include #include #include @@ -106,17 +105,11 @@ protected: #ifndef HAVE_CUDA DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator > #else DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index 5be285b5e..f5fb11bdf 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -58,13 +58,17 @@ if( 
${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArray_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArray_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 7377cbff2..113d1daa3 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include 
#include #include @@ -74,13 +73,6 @@ using DistributedNDArrayTypes = ::testing::Types< Devices::Host >, Communicators::MpiCommunicator, std::index_sequence< 2 > > -// TODO: does it make sense for NoDistrCommunicator? -// DistributedNDArray< NDArray< double, -// SizesHolder< int, 0 >, -// std::index_sequence< 0 >, -// Devices::Host >, -// Communicators::NoDistrCommunicator, -// std::index_sequence< 2 > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, @@ -89,13 +81,6 @@ using DistributedNDArrayTypes = ::testing::Types< Devices::Cuda >, Communicators::MpiCommunicator, std::index_sequence< 2 > > -// TODO: does it make sense for NoDistrCommunicator? -// DistributedNDArray< NDArray< double, -// SizesHolder< int, 0 >, -// std::index_sequence< 0 >, -// Devices::Cuda >, -// Communicators::NoDistrCommunicator, -// std::index_sequence< 2 > > #endif >; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index f1ac970eb..145b0db5b 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index a8d3bcdab..d80e467f5 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -70,24 +69,14 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Host >, - Communicators::MpiCommunicator >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - Devices::Host >, - Communicators::NoDistrCommunicator > + 
Communicators::MpiCommunicator > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Cuda >, - Communicators::MpiCommunicator >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - Devices::Cuda >, - Communicators::NoDistrCommunicator > + Communicators::MpiCommunicator > #endif >; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index 6f777c215..a072b2e80 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ -77,7 +76,7 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda >, - Communicators::NoDistrCommunicator > + Communicators::MpiCommunicator > #endif >; diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 65723ac88..b713c8f0c 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -58,4 +58,5 @@ if( ${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedMatrixTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedMatrixTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" ) endif() diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h index 30a76f86a..ea5a7e582 100644 --- a/src/UnitTests/Matrices/DistributedMatrixTest.h +++ b/src/UnitTests/Matrices/DistributedMatrixTest.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include @@ 
-101,12 +100,10 @@ protected: // types for which DistributedMatrixTest is instantiated using DistributedMatrixTypes = ::testing::Types< - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >, - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator > #ifdef HAVE_CUDA , - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >, - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator > #endif >; diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp index 5d034087f..640aa5180 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "../../Functions/Functions.h" @@ -53,7 +53,7 @@ TEST(CutMeshFunction, 2D) //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(0), StaticVector<1,int>(1), @@ -116,7 +116,7 @@ TEST(CutMeshFunction, 3D_1) //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<2,int>(0,2), @@ -179,7 +179,7 @@ 
TEST(CutMeshFunction, 3D_2) //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(2,1), StaticVector<1,int>(0), -- GitLab From 1373faf57f97642050f6a61051f91bc171f3e82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 1 Nov 2020 13:41:39 +0100 Subject: [PATCH 04/50] Removed NoDistrCommunicator which is now unused --- src/TNL/Communicators/NoDistrCommunicator.h | 149 -------------------- 1 file changed, 149 deletions(-) delete mode 100644 src/TNL/Communicators/NoDistrCommunicator.h diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h deleted file mode 100644 index c0d89015b..000000000 --- a/src/TNL/Communicators/NoDistrCommunicator.h +++ /dev/null @@ -1,149 +0,0 @@ -/*************************************************************************** - NoDistrCommunicator.h - description - ------------------- - begin : Jan 9, 2018 - copyright : (C) 2018 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Communicators { - -//! \brief Dummy communicator without any distribution support. 
-class NoDistrCommunicator -{ - public: - using Request = int; - using CommunicationGroup = int; - static constexpr Request NullRequest = -1; - static constexpr CommunicationGroup AllGroup = 1; - static constexpr CommunicationGroup NullGroup = 0; - - static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ){}; - - static bool setup( const Config::ParameterContainer& parameters, - const String& prefix = "" ) - { - return true; - } - - static void Init(int& argc, char**& argv) {} - - static void setupRedirection(){} - - static void Finalize(){} - - static bool IsInitialized() - { - return true; - } - - static bool isDistributed() - { - return false; - } - - static int GetRank(CommunicationGroup group = AllGroup ) - { - return 0; - } - - static int GetSize(CommunicationGroup group = AllGroup ) - { - return 1; - } - - static void DimsCreate(int nproc, int dim, int *distr) - { - for(int i=0;i - static Request ISend( const T *data, int count, int dest, int tag, CommunicationGroup group) - { - return 1; - } - - template - static Request IRecv( const T *data, int count, int src, int tag, CommunicationGroup group) - { - return 1; - } - - static void WaitAll(Request *reqs, int length) - { - } - - template< typename T > - static void Bcast( T* data, int count, int root, CommunicationGroup group) - { - } - - template< typename T > - static void Allreduce( const T* data, - T* reduced_data, - int count, - const MPI_Op &op, - CommunicationGroup group ) - { - memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) ); - } - - // in-place variant of Allreduce - template< typename T > - static void Allreduce( T* data, - int count, - const MPI_Op &op, - CommunicationGroup group ) - { - } - - template< typename T > - static void Reduce( T* data, - T* reduced_data, - int count, - MPI_Op &op, - int root, - CommunicationGroup group ) - { - memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) ); - } - - template< typename T > - 
static void Alltoall( const T* sendData, - int sendCount, - T* receiveData, - int receiveCount, - CommunicationGroup group ) - { - TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount for NoDistrCommunicator." ); - memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) ); - } - - static void CreateNewGroup(bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup) - { - newGroup=oldGroup; - } - - static void writeProlog( Logger& logger ) - { - } -}; - -} // namespace Communicators -} // namespace TNL -- GitLab From f30d68cfa09bc8e43de034a989fcfac6712865c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 2 Nov 2020 20:59:18 +0100 Subject: [PATCH 05/50] PVTUReader: added methods readLocalPointData and readLocalCellData --- src/TNL/Meshes/Readers/PVTUReader.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h index 666aa4f45..4bb8ba7eb 100644 --- a/src/TNL/Meshes/Readers/PVTUReader.h +++ b/src/TNL/Meshes/Readers/PVTUReader.h @@ -211,6 +211,18 @@ public: mesh.setCommunicationGroup( group ); } + VariantVector + readLocalPointData( std::string arrayName ) + { + return localReader.readPointData( arrayName ); + } + + VariantVector + readLocalCellData( std::string arrayName ) + { + return localReader.readCellData( arrayName ); + } + virtual void reset() override { resetBase(); -- GitLab From d798788eddd21bfbfab6dda2acf47fb21d56a4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 2 Nov 2020 21:17:37 +0100 Subject: [PATCH 06/50] pytnl: added bindings for DistributedMesh and PVTU mesh reader --- src/Python/pytnl/CMakeLists.txt | 3 ++ src/Python/pytnl/tnl/CMakeLists.txt | 1 + src/Python/pytnl/tnl/Mesh.cpp | 25 --------- src/Python/pytnl/tnl/MeshReaders.cpp | 37 +++++++++++++ src/Python/pytnl/tnl/MeshReaders.h | 47 ++++++++++++++++ src/Python/pytnl/tnl/tnl.cpp | 
2 + src/Python/pytnl/tnl_conversions.h | 1 + src/Python/pytnl/tnl_mpi/CMakeLists.txt | 54 +++++++++++++++++++ src/Python/pytnl/tnl_mpi/DistributedMesh.cpp | 20 +++++++ src/Python/pytnl/tnl_mpi/DistributedMesh.h | 34 ++++++++++++ .../pytnl/tnl_mpi/DistributedMeshReaders.cpp | 26 +++++++++ src/Python/pytnl/tnl_mpi/tnl_mpi.cpp | 42 +++++++++++++++ src/Python/pytnl/typedefs.h | 5 ++ src/Python/pytnl/variant_caster.h | 15 ++++++ 14 files changed, 287 insertions(+), 25 deletions(-) create mode 100644 src/Python/pytnl/tnl/MeshReaders.cpp create mode 100644 src/Python/pytnl/tnl/MeshReaders.h create mode 100644 src/Python/pytnl/tnl_mpi/CMakeLists.txt create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMesh.cpp create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMesh.h create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp create mode 100644 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp create mode 100644 src/Python/pytnl/variant_caster.h diff --git a/src/Python/pytnl/CMakeLists.txt b/src/Python/pytnl/CMakeLists.txt index 2065b0a13..15b8e6b0a 100644 --- a/src/Python/pytnl/CMakeLists.txt +++ b/src/Python/pytnl/CMakeLists.txt @@ -1,4 +1,7 @@ add_subdirectory( tnl ) +if( BUILD_MPI ) + add_subdirectory( tnl_mpi ) +endif() install( DIRECTORY . 
DESTINATION "include/pytnl" MESSAGE_NEVER diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt index c7fcd80e2..9c95d6326 100644 --- a/src/Python/pytnl/tnl/CMakeLists.txt +++ b/src/Python/pytnl/tnl/CMakeLists.txt @@ -6,6 +6,7 @@ set( sources Grid2D.cpp Grid3D.cpp Mesh.cpp + MeshReaders.cpp Object.cpp SparseMatrix.cpp String.cpp diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp index aa0c8c035..a3e582680 100644 --- a/src/Python/pytnl/tnl/Mesh.cpp +++ b/src/Python/pytnl/tnl/Mesh.cpp @@ -2,35 +2,10 @@ #include "../tnl_conversions.h" #include "Mesh.h" -#include -#include - -template< typename Reader > -void export_reader( py::module & m, const char* name ) -{ - py::class_< Reader >( m, name ) - .def(py::init()) - .def("loadMesh", &Reader::template loadMesh< MeshOfEdges >) - .def("loadMesh", &Reader::template loadMesh< MeshOfTriangles >) - .def("loadMesh", &Reader::template loadMesh< MeshOfTetrahedrons >) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfEdges & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTriangles & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTetrahedrons & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) - ; -} void export_Meshes( py::module & m ) { export_Mesh< MeshOfEdges >( m, "MeshOfEdges" ); export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" ); export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" ); - - export_reader< TNL::Meshes::Readers::VTKReader >( m, "VTKReader" ); - export_reader< TNL::Meshes::Readers::VTUReader >( m, "VTUReader" ); } diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp new file mode 100644 index 000000000..d47ec5268 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshReaders.cpp @@ -0,0 +1,37 @@ +// 
conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "MeshReaders.h" +#include "../typedefs.h" + +void export_MeshReaders( py::module & m ) +{ + using MeshReader = TNL::Meshes::Readers::MeshReader; + using XMLVTK = TNL::Meshes::Readers::XMLVTK; + + // base class with trampolines for virtual methods + py::class_< MeshReader, PyMeshReader >( m, "MeshReader" ) + .def(py::init()) + // bindings against the actual class, NOT the trampoline + .def("reset", &MeshReader::reset) + .def("detectMesh", &MeshReader::detectMesh) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >) + ; + + py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" ) + .def(py::init()) + ; + + // base class for VTUReader and PVTUReader + py::class_< XMLVTK, PyXMLVTK, MeshReader >( m, "XMLVTK" ) + .def(py::init()) + .def("readPointData", &XMLVTK::readPointData) + .def("readCellData", &XMLVTK::readCellData) + ; + + py::class_< TNL::Meshes::Readers::VTUReader, XMLVTK >( m, "VTUReader" ) + .def(py::init()) + ; +} diff --git a/src/Python/pytnl/tnl/MeshReaders.h b/src/Python/pytnl/tnl/MeshReaders.h new file mode 100644 index 000000000..22b40a671 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshReaders.h @@ -0,0 +1,47 @@ +#include +#include + +// trampoline classes needed for overriding virtual methods +// https://pybind11.readthedocs.io/en/stable/advanced/classes.html + +class PyMeshReader +: public TNL::Meshes::Readers::MeshReader +{ + using Parent = TNL::Meshes::Readers::MeshReader; + +public: + // inherit constructors + using TNL::Meshes::Readers::MeshReader::MeshReader; + + // trampolines (one for each virtual method) + void reset() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, reset ); + } + + void detectMesh() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh ); + } 
+}; + +class PyXMLVTK +: public TNL::Meshes::Readers::XMLVTK +{ + using Parent = TNL::Meshes::Readers::XMLVTK; + +public: + // inherit constructors + using TNL::Meshes::Readers::XMLVTK::XMLVTK; + + // trampolines (one for each virtual method) + void reset() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, reset ); + } + + void detectMesh() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh ); + } +}; diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp index 0eb7c3e8b..b60b98d68 100644 --- a/src/Python/pytnl/tnl/tnl.cpp +++ b/src/Python/pytnl/tnl/tnl.cpp @@ -14,6 +14,7 @@ void export_Grid1D( py::module & m ); void export_Grid2D( py::module & m ); void export_Grid3D( py::module & m ); void export_Meshes( py::module & m ); +void export_MeshReaders( py::module & m ); void export_SparseMatrices( py::module & m ); template< typename T > @@ -42,6 +43,7 @@ PYBIND11_MODULE(tnl, m) export_Grid3D(m); export_Meshes(m); + export_MeshReaders(m); export_SparseMatrices(m); } diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h index 602d1cffd..e942db324 100644 --- a/src/Python/pytnl/tnl_conversions.h +++ b/src/Python/pytnl/tnl_conversions.h @@ -1,3 +1,4 @@ // conversion has to be registered for each object file #include "tnl_str_conversion.h" #include "tnl_tuple_conversion.h" +#include "variant_caster.h" diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt new file mode 100644 index 000000000..8ea9a70c1 --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt @@ -0,0 +1,54 @@ +# enable C++14 for pytnl_mpi (due to py::overload_cast) +set(PYBIND11_CPP_STANDARD -std=c++14) + +set( sources + DistributedMesh.cpp + DistributedMeshReaders.cpp + tnl_mpi.cpp +) +pybind11_add_module( pytnl_mpi ${sources} ) + +# rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so +set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi ) + +# Skip -march=native 
-mtune=native for pytnl_mpi - optimizing python bindings for +# a specific architecture is not very useful and prevents using Python tools on +# hybrid clusters. +get_target_property( pytnl_mpi_COMPILE_OPTIONS pytnl_mpi COMPILE_OPTIONS ) +if( pytnl_mpi_COMPILE_OPTIONS ) + string( REPLACE "-march=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) + string( REPLACE "-mtune=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) + set_target_properties( pytnl_mpi PROPERTIES COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) +endif() + +# We have bindings for unsafe objects (e.g. Array::operator[]) where assertion +# is the only safeguard, so we need to translate the TNL::AssertionError to +# Python's AssertionError. +# NDEBUG is defined in the global CMAKE_CXX_FLAGS and cannot be easily removed +# per-target, so we need to undefine it by passing -U NDEBUG. +target_compile_options( pytnl_mpi PRIVATE -U NDEBUG -D TNL_THROW_ASSERTION_ERROR ) + +# disable errors due to -Wunused-value coming from pybind11 +if( ${WITH_CI_FLAGS} ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options( pytnl_mpi PRIVATE -Wno-error=unused-value ) + endif() +endif() + + +# enable zlib and tinyxml2 (used by PVTUReader) +find_package( ZLIB ) +if( ZLIB_FOUND ) + target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_ZLIB") + target_include_directories(pytnl_mpi PUBLIC ${ZLIB_INCLUDE_DIRS}) + target_link_libraries(pytnl_mpi PUBLIC ${ZLIB_LIBRARIES}) +endif() + +find_package( tinyxml2 QUIET ) +if( tinyxml2_FOUND ) + target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_TINYXML2") + target_link_libraries(pytnl_mpi PUBLIC tinyxml2::tinyxml2) +endif() + + +install( TARGETS pytnl_mpi DESTINATION ${PYTHON_SITE_PACKAGES_DIR} ) diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp new file mode 100644 index 000000000..253c1a9d4 --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp @@ -0,0 +1,20 @@ +// 
conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../typedefs.h" +#include "DistributedMesh.h" +#include "../tnl/Array.h" + +void export_DistributedMeshes( py::module & m ) +{ + // make sure that bindings for the local meshes are available + py::module_::import("tnl"); + + export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" ); + export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" ); + export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" ); + + // export VTKTypesArrayType + using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType; + export_Array< VTKTypesArrayType >(m, "VTKTypesArrayType"); +} diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.h b/src/Python/pytnl/tnl_mpi/DistributedMesh.h new file mode 100644 index 000000000..64afe5978 --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.h @@ -0,0 +1,34 @@ +#pragma once + +#include +namespace py = pybind11; + +template< typename Mesh > +void export_DistributedMesh( py::module & m, const char* name ) +{ + auto mesh = py::class_< Mesh >( m, name ) + .def(py::init<>()) + .def_static("getMeshDimension", &Mesh::getMeshDimension) +// .def("setCommunicationGroup", &Mesh::setCommunicationGroup) +// .def("getCommunicationGroup", &Mesh::getCommunicationGroup) + .def("getLocalMesh", py::overload_cast<>(&Mesh::getLocalMesh), py::return_value_policy::reference_internal) + .def("setGhostLevels", &Mesh::setGhostLevels) + .def("getGhostLevels", &Mesh::getGhostLevels) + .def("getGlobalPointIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& { + return mesh.template getGlobalIndices< 0 >(); + }, + py::return_value_policy::reference_internal) + .def("getGlobalCellIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& { + return mesh.template getGlobalIndices< Mesh::getMeshDimension() >(); + }, + 
py::return_value_policy::reference_internal) + .def("vtkPointGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& { + return mesh.vtkPointGhostTypes(); + }, + py::return_value_policy::reference_internal) + .def("vtkCellGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& { + return mesh.vtkCellGhostTypes(); + }, + py::return_value_policy::reference_internal) + ; +} diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp new file mode 100644 index 000000000..bb902b5bc --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp @@ -0,0 +1,26 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../tnl/MeshReaders.h" +#include "../typedefs.h" + +#include + +void export_DistributedMeshReaders( py::module & m ) +{ + using XMLVTK = TNL::Meshes::Readers::XMLVTK; + using PVTUReader = TNL::Meshes::Readers::PVTUReader; + + // make sure that bindings for the parent class are available + py::module_::import("tnl"); + + py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" ) + .def(py::init()) + // loadMesh is not virtual in PVTUReader + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >) + .def("readLocalPointData", &PVTUReader::readLocalPointData) + .def("readLocalCellData", &PVTUReader::readLocalCellData) + ; +} diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp new file mode 100644 index 000000000..6d9986a7a --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -0,0 +1,42 @@ +#include "../exceptions.h" +#include "../typedefs.h" + +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +// external functions +void 
export_DistributedMeshes( py::module & m ); +void export_DistributedMeshReaders( py::module & m ); + +#include + +// Python module definition +PYBIND11_MODULE(tnl_mpi, m) +{ + register_exceptions(m); + + // MPI initialization and finalization + // https://stackoverflow.com/q/64647846 + if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) { + int argc = 0; + char** argv = nullptr; + TNL::Communicators::MpiCommunicator::Init( argc, argv ); + } + // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors + auto cleanup_callback = []() { + if( TNL::Communicators::MpiCommunicator::IsInitialized() ) + TNL::Communicators::MpiCommunicator::Finalize(); + }; + m.add_object("_cleanup", py::capsule(cleanup_callback)); + + // bindings for distributed data structures + export_DistributedMeshes(m); + export_DistributedMeshReaders(m); + + // bindings for functions + using TNL::Meshes::DistributedMeshes::distributeSubentities; + m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) { + distributeSubentities< 1 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) { + distributeSubentities< 2 >( mesh ); }); +} diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h index 7a74237f0..94e0de5f7 100644 --- a/src/Python/pytnl/typedefs.h +++ b/src/Python/pytnl/typedefs.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -37,3 +38,7 @@ using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< RealType, IndexType, LocalIndexType > >; + +using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >; +using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >; +using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >; diff --git a/src/Python/pytnl/variant_caster.h b/src/Python/pytnl/variant_caster.h new file mode 100644 index 
000000000..c032448b5 --- /dev/null +++ b/src/Python/pytnl/variant_caster.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +#include // backport of std::variant from C++17 + +namespace pybind11 { namespace detail { + +// add specialization for concrete variant type +// (variant_caster is implemented in pybind11 and used for C++17's std::variant casting) +template struct type_caster> + : variant_caster> {}; + +}} // namespace pybind11::detail -- GitLab From 51f77b2f597c210191091409cf1e5ecd20b8bdec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 4 Nov 2020 15:31:16 +0100 Subject: [PATCH 07/50] pytnl: set proper debug postfix and pybind11 module names Python cannot easily import modules containing "-" so we use "_dbg" instead of "-dbg". --- src/Python/pytnl/tnl/CMakeLists.txt | 7 ++++++- src/Python/pytnl/tnl/tnl.cpp | 2 +- src/Python/pytnl/tnl_mpi/CMakeLists.txt | 7 ++++++- src/Python/pytnl/tnl_mpi/DistributedMesh.cpp | 2 +- .../pytnl/tnl_mpi/DistributedMeshReaders.cpp | 2 +- src/Python/pytnl/tnl_mpi/tnl_mpi.cpp | 2 +- src/Python/pytnl/typedefs.h | 14 ++++++++++++++ 7 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt index 9c95d6326..d06a4d16e 100644 --- a/src/Python/pytnl/tnl/CMakeLists.txt +++ b/src/Python/pytnl/tnl/CMakeLists.txt @@ -15,7 +15,12 @@ set( sources pybind11_add_module( pytnl ${sources} ) # rename the shared library to tnl.cpython-XXm-x86_64-linux-gnu.so -set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl ) +set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl DEBUG_POSTFIX "_dbg" ) + +# indicate the postfix to the target so that the pybind11 module name can be set accordingly +if( CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options( pytnl PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg ) +endif() # Skip -march=native -mtune=native for pytnl - optimizing python bindings for # a specific architecture is not very 
useful and prevents using Python tools on diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp index b60b98d68..75bd08421 100644 --- a/src/Python/pytnl/tnl/tnl.cpp +++ b/src/Python/pytnl/tnl/tnl.cpp @@ -24,7 +24,7 @@ template< typename T > using _vector = TNL::Containers::Vector< T, TNL::Devices::Host, IndexType >; // Python module definition -PYBIND11_MODULE(tnl, m) +PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m) { register_exceptions(m); diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt index 8ea9a70c1..ee5e9cc32 100644 --- a/src/Python/pytnl/tnl_mpi/CMakeLists.txt +++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt @@ -9,7 +9,12 @@ set( sources pybind11_add_module( pytnl_mpi ${sources} ) # rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so -set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi ) +set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi DEBUG_POSTFIX "_dbg" ) + +# indicate the postfix to the target so that the pybind11 module name can be set accordingly +if( CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options( pytnl_mpi PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg ) +endif() # Skip -march=native -mtune=native for pytnl_mpi - optimizing python bindings for # a specific architecture is not very useful and prevents using Python tools on diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp index 253c1a9d4..03ee3692e 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp @@ -8,7 +8,7 @@ void export_DistributedMeshes( py::module & m ) { // make sure that bindings for the local meshes are available - py::module_::import("tnl"); + py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl))); export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" ); export_DistributedMesh< DistributedMeshOfTriangles >( m, 
"DistributedMeshOfTriangles" ); diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp index bb902b5bc..7847e340b 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp @@ -12,7 +12,7 @@ void export_DistributedMeshReaders( py::module & m ) using PVTUReader = TNL::Meshes::Readers::PVTUReader; // make sure that bindings for the parent class are available - py::module_::import("tnl"); + py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl))); py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" ) .def(py::init()) diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp index 6d9986a7a..de2359ac2 100644 --- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -11,7 +11,7 @@ void export_DistributedMeshReaders( py::module & m ); #include // Python module definition -PYBIND11_MODULE(tnl_mpi, m) +PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m) { register_exceptions(m); diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h index 94e0de5f7..ac4b6bd83 100644 --- a/src/Python/pytnl/typedefs.h +++ b/src/Python/pytnl/typedefs.h @@ -1,5 +1,19 @@ #pragma once +// helper macros (the _NX variants are needed to expand macros in the arguments) +#define PYTNL_STRINGIFY(U) PYTNL_STRINGIFY_NX(U) +#define PYTNL_STRINGIFY_NX(U) #U + +#define PYTNL_PPCAT(A, B) PYTNL_PPCAT_NX(A, B) +#define PYTNL_PPCAT_NX(A, B) A ## B + +// the Python module name depends on the build type, this macro can be used to concatenate with the correct suffix +#ifdef PYTNL_MODULE_POSTFIX + #define PYTNL_MODULE_NAME(name) PYTNL_PPCAT(name, PYTNL_MODULE_POSTFIX) +#else + #define PYTNL_MODULE_NAME(name) name +#endif + #include #include #include -- GitLab From b8fa05a2d6fc10fa75fb4bd4723fa62d581e408e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 4 Nov 2020 15:34:53 +0100 Subject: 
[PATCH 08/50] pytnl: added bindings for VTKTraits --- src/Python/pytnl/tnl/CMakeLists.txt | 1 + src/Python/pytnl/tnl/VTKTraits.cpp | 45 +++++++++++++++++++++++++++++ src/Python/pytnl/tnl/tnl.cpp | 3 ++ src/TNL/Meshes/VTKTraits.h | 6 ++-- 4 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 src/Python/pytnl/tnl/VTKTraits.cpp diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt index d06a4d16e..34fe9b179 100644 --- a/src/Python/pytnl/tnl/CMakeLists.txt +++ b/src/Python/pytnl/tnl/CMakeLists.txt @@ -10,6 +10,7 @@ set( sources Object.cpp SparseMatrix.cpp String.cpp + VTKTraits.cpp tnl.cpp ) pybind11_add_module( pytnl ${sources} ) diff --git a/src/Python/pytnl/tnl/VTKTraits.cpp b/src/Python/pytnl/tnl/VTKTraits.cpp new file mode 100644 index 000000000..85d796471 --- /dev/null +++ b/src/Python/pytnl/tnl/VTKTraits.cpp @@ -0,0 +1,45 @@ +#include +namespace py = pybind11; + +#include + +void export_VTKTraits( py::module & m ) +{ + py::enum_< TNL::Meshes::VTK::FileFormat >( m, "VTKFileFormat") + .value("ascii", TNL::Meshes::VTK::FileFormat::ascii) + .value("binary", TNL::Meshes::VTK::FileFormat::binary) + .value("zlib_compressed", TNL::Meshes::VTK::FileFormat::zlib_compressed) + ; + py::enum_< TNL::Meshes::VTK::DataType >( m, "VTKDataType") + .value("CellData", TNL::Meshes::VTK::DataType::CellData) + .value("PointData", TNL::Meshes::VTK::DataType::PointData) + ; + py::enum_< TNL::Meshes::VTK::EntityShape >( m, "VTKEntityShape") + .value("Vertex", TNL::Meshes::VTK::EntityShape::Vertex) + .value("PolyVertex", TNL::Meshes::VTK::EntityShape::PolyVertex) + .value("Line", TNL::Meshes::VTK::EntityShape::Line) + .value("PolyLine", TNL::Meshes::VTK::EntityShape::PolyLine) + .value("Triangle", TNL::Meshes::VTK::EntityShape::Triangle) + .value("TriangleStrip", TNL::Meshes::VTK::EntityShape::TriangleStrip) + .value("Polygon", TNL::Meshes::VTK::EntityShape::Polygon) + .value("Pixel", TNL::Meshes::VTK::EntityShape::Pixel) + .value("Quad", 
TNL::Meshes::VTK::EntityShape::Quad) + .value("Tetra", TNL::Meshes::VTK::EntityShape::Tetra) + .value("Voxel", TNL::Meshes::VTK::EntityShape::Voxel) + .value("Hexahedron", TNL::Meshes::VTK::EntityShape::Hexahedron) + .value("Wedge", TNL::Meshes::VTK::EntityShape::Wedge) + .value("Pyramid", TNL::Meshes::VTK::EntityShape::Pyramid) + ; + py::enum_< TNL::Meshes::VTK::CellGhostTypes >( m, "VTKCellGhostTypes") + .value("DUPLICATECELL", TNL::Meshes::VTK::CellGhostTypes::DUPLICATECELL, "the cell is present on multiple processors") + .value("HIGHCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::HIGHCONNECTIVITYCELL, "the cell has more neighbors than in a regular mesh") + .value("LOWCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::LOWCONNECTIVITYCELL, "the cell has less neighbors than in a regular mesh") + .value("REFINEDCELL", TNL::Meshes::VTK::CellGhostTypes::REFINEDCELL, "other cells are present that refines it") + .value("EXTERIORCELL", TNL::Meshes::VTK::CellGhostTypes::EXTERIORCELL, "the cell is on the exterior of the data set") + .value("HIDDENCELL", TNL::Meshes::VTK::CellGhostTypes::HIDDENCELL, "the cell is needed to maintain connectivity, but the data values should be ignored") + ; + py::enum_< TNL::Meshes::VTK::PointGhostTypes >( m, "VTKPointGhostTypes") + .value("DUPLICATEPOINT", TNL::Meshes::VTK::PointGhostTypes::DUPLICATEPOINT, "the point is present on multiple processors") + .value("HIDDENPOINT", TNL::Meshes::VTK::PointGhostTypes::HIDDENPOINT, "the point is needed to maintain connectivity, but the data values should be ignored") + ; +} diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp index 75bd08421..2b2a852fa 100644 --- a/src/Python/pytnl/tnl/tnl.cpp +++ b/src/Python/pytnl/tnl/tnl.cpp @@ -13,6 +13,7 @@ void export_String( py::module & m ); void export_Grid1D( py::module & m ); void export_Grid2D( py::module & m ); void export_Grid3D( py::module & m ); +void export_VTKTraits( py::module & m ); void export_Meshes( py::module & m );
void export_MeshReaders( py::module & m ); void export_SparseMatrices( py::module & m ); @@ -42,6 +43,8 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m) export_Grid2D(m); export_Grid3D(m); + export_VTKTraits(m); + export_Meshes(m); export_MeshReaders(m); diff --git a/src/TNL/Meshes/VTKTraits.h b/src/TNL/Meshes/VTKTraits.h index e09b6c342..0883b607a 100644 --- a/src/TNL/Meshes/VTKTraits.h +++ b/src/TNL/Meshes/VTKTraits.h @@ -172,16 +172,16 @@ enum class CellGhostTypes DUPLICATECELL = 1, // the cell is present on multiple processors HIGHCONNECTIVITYCELL = 2, // the cell has more neighbors than in a regular mesh LOWCONNECTIVITYCELL = 4, // the cell has less neighbors than in a regular mesh - REFINEDCELL = 8, // other cells are present that refines it. + REFINEDCELL = 8, // other cells are present that refines it EXTERIORCELL = 16, // the cell is on the exterior of the data set - HIDDENCELL = 32 // the cell is needed to maintain connectivity, but the data values should be ignored. + HIDDENCELL = 32 // the cell is needed to maintain connectivity, but the data values should be ignored }; enum class PointGhostTypes : std::uint8_t { DUPLICATEPOINT = 1, // the cell is present on multiple processors - HIDDENPOINT = 2 // the point is needed to maintain connectivity, but the data values should be ignored. + HIDDENPOINT = 2 // the point is needed to maintain connectivity, but the data values should be ignored }; /** -- GitLab From 85c1a04e2321a137c9c1e295341157f9ff496bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 4 Nov 2020 23:09:09 +0100 Subject: [PATCH 09/50] Mesh writers: avoid reference in class members, initialize by rdbuf Using references as class members is weird, because they cannot extend the lifetime of the objects they are initialized with. 
Using values of the type std::ostream and initializing by rdbuf (which is a pointer) works better, probably because the underlying rdbuf generally outlives the ostreams that were being passed to the writers. --- src/TNL/Meshes/Writers/PVTUWriter.h | 4 ++-- src/TNL/Meshes/Writers/VTKWriter.h | 4 ++-- src/TNL/Meshes/Writers/VTUWriter.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h index 8ef4d2b7b..5aa9cd2b0 100644 --- a/src/TNL/Meshes/Writers/PVTUWriter.h +++ b/src/TNL/Meshes/Writers/PVTUWriter.h @@ -31,7 +31,7 @@ public: PVTUWriter() = delete; PVTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed ) - : str(str), format(format) + : str(str.rdbuf()), format(format) {} // If desired, cycle and time of the simulation can put into the file. This follows the instructions at @@ -79,7 +79,7 @@ protected: void writeFooter(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; diff --git a/src/TNL/Meshes/Writers/VTKWriter.h b/src/TNL/Meshes/Writers/VTKWriter.h index e1c5fae97..db0c09b13 100644 --- a/src/TNL/Meshes/Writers/VTKWriter.h +++ b/src/TNL/Meshes/Writers/VTKWriter.h @@ -45,7 +45,7 @@ public: VTKWriter() = delete; VTKWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::binary ) - : str(str), format(format) + : str(str.rdbuf()), format(format) { if( format != VTK::FileFormat::ascii && format != VTK::FileFormat::binary ) throw std::domain_error("The Legacy VTK file formats support only ASCII and BINARY formats."); @@ -78,7 +78,7 @@ protected: void writeHeader(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; diff --git a/src/TNL/Meshes/Writers/VTUWriter.h b/src/TNL/Meshes/Writers/VTUWriter.h index 9f715dce6..00765cc0d 100644 --- a/src/TNL/Meshes/Writers/VTUWriter.h +++ b/src/TNL/Meshes/Writers/VTUWriter.h @@ -44,7 +44,7 @@ public: VTUWriter() = delete; VTUWriter( std::ostream& str, 
VTK::FileFormat format = VTK::FileFormat::zlib_compressed ) - : str(str), format(format) + : str(str.rdbuf()), format(format) {} // If desired, cycle and time of the simulation can put into the file. This follows the instructions at @@ -78,7 +78,7 @@ protected: void writeFooter(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; -- GitLab From 912af6da79cb51f6f2faaaa67d8e1f9e98374ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 5 Nov 2020 23:47:09 +0100 Subject: [PATCH 10/50] Fixed HostArray type in mesh writers - ValueType must be non-const (otherwise it won't work with an array view with a const ValueType) - IndexType should be taken from the array --- src/TNL/Meshes/Writers/VTKWriter.hpp | 2 +- src/TNL/Meshes/Writers/VTUWriter.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TNL/Meshes/Writers/VTKWriter.hpp b/src/TNL/Meshes/Writers/VTKWriter.hpp index 125366d03..801d3bc19 100644 --- a/src/TNL/Meshes/Writers/VTKWriter.hpp +++ b/src/TNL/Meshes/Writers/VTKWriter.hpp @@ -509,7 +509,7 @@ VTKWriter< Mesh >::writeDataArray( const Array& array, // use a host buffer if direct access to the array elements is not possible if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value ) { - using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >; + using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >; HostArray hostBuffer; hostBuffer = array; writeDataArray( hostBuffer, name, numberOfComponents ); diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp index 8d609f0a7..61872ffe1 100644 --- a/src/TNL/Meshes/Writers/VTUWriter.hpp +++ b/src/TNL/Meshes/Writers/VTUWriter.hpp @@ -459,7 +459,7 @@ VTUWriter< Mesh >::writeDataArray( const Array& array, // use a host buffer if direct access to the array elements is not possible if( std::is_same< 
typename Array::DeviceType, Devices::Cuda >::value ) { - using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >; + using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >; HostArray hostBuffer; hostBuffer = array; writeDataArray( hostBuffer, name, numberOfComponents ); -- GitLab From b426950f236e14edf99ed11a257adab9475176d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 4 Nov 2020 23:14:04 +0100 Subject: [PATCH 11/50] pytnl: added bindings for mesh writers --- src/3rdparty/CMakeLists.txt | 6 + src/3rdparty/cctbx/pystreambuf.h | 519 +++++++++++++++++++++++++++ src/Python/pytnl/iostream_caster.h | 59 +++ src/Python/pytnl/tnl/CMakeLists.txt | 1 + src/Python/pytnl/tnl/MeshWriters.cpp | 88 +++++ src/Python/pytnl/tnl/MeshWriters.h | 22 ++ src/Python/pytnl/tnl/tnl.cpp | 2 + src/Python/pytnl/tnl_conversions.h | 1 + 8 files changed, 698 insertions(+) create mode 100644 src/3rdparty/cctbx/pystreambuf.h create mode 100644 src/Python/pytnl/iostream_caster.h create mode 100644 src/Python/pytnl/tnl/MeshWriters.cpp create mode 100644 src/Python/pytnl/tnl/MeshWriters.h diff --git a/src/3rdparty/CMakeLists.txt b/src/3rdparty/CMakeLists.txt index 6dba288f0..01550de19 100644 --- a/src/3rdparty/CMakeLists.txt +++ b/src/3rdparty/CMakeLists.txt @@ -1,3 +1,9 @@ install( DIRECTORY mpark Leksys TYPE INCLUDE MESSAGE_NEVER FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" ) + +if( ${WITH_PYTHON} ) + install( DIRECTORY cctbx TYPE INCLUDE + MESSAGE_NEVER + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" ) +endif() diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h new file mode 100644 index 000000000..6e0c497e4 --- /dev/null +++ b/src/3rdparty/cctbx/pystreambuf.h @@ -0,0 +1,519 @@ +/* Original code: https://gist.github.com/asford/544323a5da7dddad2c9174490eb5ed06 + * License: + * This component utilizes components derived 
from cctbx, available at + * http://cci.lbl.gov/cctbx_sources/boost_adaptbx/python_streambuf.h + * + * *** License agreement *** + * + * cctbx Copyright (c) 2006, The Regents of the University of + * California, through Lawrence Berkeley National Laboratory (subject to + * receipt of any required approvals from the U.S. Dept. of Energy). All + * rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * (1) Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * (2) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * (3) Neither the name of the University of California, Lawrence Berkeley + * National Laboratory, U.S. Dept. of Energy nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You are under no obligation whatsoever to provide any bug fixes, + * patches, or upgrades to the features, functionality or performance of + * the source code ("Enhancements") to anyone; however, if you choose to + * make your Enhancements available either publicly, or directly to + * Lawrence Berkeley National Laboratory, without imposing a separate + * written license agreement for such Enhancements, then you hereby grant + * the following license: a non-exclusive, royalty-free perpetual license + * to install, use, modify, prepare derivative works, incorporate into + * other computer software, distribute, and sublicense such enhancements or + * derivative works thereof, in binary and source code form. +*/ + +#pragma once + +#include +#include + +#include + +namespace pystreambuf { + +/// A stream buffer getting data from and putting data into a Python file object +/** The aims are as follow: + + - Given a C++ function acting on a standard stream, e.g. + + \code + void read_inputs(std::istream& input) { + ... + input >> something >> something_else; + } + \endcode + + and given a piece of Python code which creates a file-like object, + to be able to pass this file object to that C++ function, e.g. + + \code + import gzip + gzip_file_obj = gzip.GzipFile(...) + read_inputs(gzip_file_obj) + \endcode + + and have the standard stream pull data from and put data into the Python + file object. + + - When Python \c read_inputs() returns, the Python object is able to + continue reading or writing where the C++ code left off. + + - Operations in C++ on mere files should be competitively fast compared + to the direct use of \c std::fstream. + + + \b Motivation + + - the standard Python library offer of file-like objects (files, + compressed files and archives, network, ...) is far superior to the + offer of streams in the C++ standard library and Boost C++ libraries. 
+ + - i/o code involves a fair amount of text processing which is more + efficiently prototyped in Python but then one may need to rewrite + a time-critical part in C++, in as seamless a manner as possible. + + \b Usage + + This is 2-step: + + - a trivial wrapper function + + \code + using boost_adaptbx::python::streambuf; + void read_inputs_wrapper(streambuf& input) + { + streambuf::istream is(input); + read_inputs(is); + } + + def("read_inputs", read_inputs_wrapper); + \endcode + + which has to be written every time one wants a Python binding for + such a C++ function. + + - the Python side + + \code + from boost.python import streambuf + read_inputs(streambuf(python_file_obj=obj, buffer_size=1024)) + \endcode + + \c buffer_size is optional. See also: \c default_buffer_size + + Note: references are to the C++ standard (the numbers between parentheses + at the end of references are margin markers). +*/ +class streambuf : public std::basic_streambuf +{ + private: + typedef std::basic_streambuf base_t; + + public: + /* The syntax + using base_t::char_type; + would be nicer but Visual Studio C++ 8 chokes on it + */ + typedef base_t::char_type char_type; + typedef base_t::int_type int_type; + typedef base_t::pos_type pos_type; + typedef base_t::off_type off_type; + typedef base_t::traits_type traits_type; + + /// The default size of the read and write buffer. + /** They are respectively used to buffer data read from and data written to + the Python file object. It can be modified from Python. + */ + static constexpr std::size_t default_buffer_size = 1024; + + /// Construct from a Python file object + /** if buffer_size is 0 the current default_buffer_size is used. 
+ */ + streambuf( + pybind11::object& python_file_obj, + std::size_t buffer_size_=0) + : + py_read (getattr(python_file_obj, "read", pybind11::none())), + py_write (getattr(python_file_obj, "write", pybind11::none())), + py_seek (getattr(python_file_obj, "seek", pybind11::none())), + py_tell (getattr(python_file_obj, "tell", pybind11::none())), + buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size), + write_buffer(0), + pos_of_read_buffer_end_in_py_file(0), + pos_of_write_buffer_end_in_py_file(buffer_size), + farthest_pptr(0) + { + assert(buffer_size != 0); + /* Some Python file objects (e.g. sys.stdout and sys.stdin) + have non-functional seek and tell. If so, assign None to + py_tell and py_seek. + */ + if (!py_tell.is_none()) { + try { + py_tell(); + } + catch (pybind11::error_already_set& err) { + py_tell = pybind11::none(); + py_seek = pybind11::none(); + err.restore(); + PyErr_Clear(); + } + } + + if (!py_write.is_none()) { + // C-like string to make debugging easier + write_buffer = new char[buffer_size + 1]; + write_buffer[buffer_size] = '\0'; + setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + else { + // The first attempt at output will result in a call to overflow + setp(0, 0); + } + + if (!py_tell.is_none()){ + off_type py_pos = py_tell().cast(); + pos_of_read_buffer_end_in_py_file = py_pos; + pos_of_write_buffer_end_in_py_file = py_pos; + } + } + + /// Mundane destructor freeing the allocated resources + virtual ~streambuf() { + if (write_buffer) delete[] write_buffer; + } + + /// C.f. C++ standard section 27.5.2.4.3 + /** It is essential to override this virtual function for the stream + member function readsome to work correctly (c.f. 27.6.1.3, alinea 30) + */ + virtual std::streamsize showmanyc() { + int_type const failure = traits_type::eof(); + int_type status = underflow(); + if (status == failure) return -1; + return egptr() - gptr(); + } + + /// C.f. 
C++ standard section 27.5.2.4.3 + virtual int_type underflow() { + int_type const failure = traits_type::eof(); + if (py_read.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'read' attribute"); + } + read_buffer = py_read(buffer_size); + char *read_buffer_data; + pybind11::ssize_t py_n_read; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(read_buffer.ptr(), + &read_buffer_data, &py_n_read) == -1) { + setg(0, 0, 0); + throw std::invalid_argument( + "The method 'read' of the Python file object " + "did not return a string."); + } + off_type n_read = (off_type)py_n_read; + pos_of_read_buffer_end_in_py_file += n_read; + setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read); + // ^^^27.5.2.3.1 (4) + if (n_read == 0) return failure; + return traits_type::to_int_type(read_buffer_data[0]); + } + + /// C.f. C++ standard section 27.5.2.4.5 + virtual int_type overflow(int_type c=traits_type::eof()) { + if (py_write.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'write' attribute"); + } + farthest_pptr = std::max(farthest_pptr, pptr()); + off_type n_written = (off_type)(farthest_pptr - pbase()); + pybind11::bytes chunk(pbase(), n_written); + py_write(chunk); + if (!traits_type::eq_int_type(c, traits_type::eof())) { + py_write(pybind11::bytes(std::string(1, traits_type::to_char_type(c)))); // write bytes like the chunk above; a bare char would be converted to a Python str + n_written++; + } + if (n_written) { + pos_of_write_buffer_end_in_py_file += n_written; + setp(pbase(), epptr()); + // ^^^ 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + return traits_type::eq_int_type( + c, traits_type::eof()) ? traits_type::not_eof(c) : c; + } + + /// Update the python file to reflect the state of this stream buffer + /** Empty the write buffer into the Python file object and set the seek + position of the latter accordingly (C++ standard section 27.5.2.4.2). + If there is no write buffer or it is empty, but there is a non-empty + read buffer, set the Python file object seek position to the + seek position in that read buffer.
+ */ + virtual int sync() { + int result = 0; + farthest_pptr = std::max(farthest_pptr, pptr()); + if (farthest_pptr && farthest_pptr > pbase()) { + off_type delta = pptr() - farthest_pptr; + int_type status = overflow(); + if (traits_type::eq_int_type(status, traits_type::eof())) result = -1; + if (!py_seek.is_none()) py_seek(delta, 1); + } + else if (gptr() && gptr() < egptr()) { + if (!py_seek.is_none()) py_seek(gptr() - egptr(), 1); + } + return result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + /** This implementation is optimised to look whether the position is within + the buffers, so as to avoid calling Python seek or tell. It is + important for many applications that the overhead of calling into Python + is avoided as much as possible (e.g. parsers which may do a lot of + backtracking) + */ + virtual + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + /* In practice, "which" is either std::ios_base::in or out + since we end up here because either seekp or seekg was called + on the stream using this buffer. That simplifies the code + in a few places. + */ + int const failure = off_type(-1); + + if (py_seek.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'seek' attribute"); + } + + // we need the read buffer to contain something! 
+ if (which == std::ios_base::in && !gptr()) { + if (traits_type::eq_int_type(underflow(), traits_type::eof())) { + return failure; + } + } + + // compute the whence parameter for Python seek + int whence; + switch (way) { + case std::ios_base::beg: + whence = 0; + break; + case std::ios_base::cur: + whence = 1; + break; + case std::ios_base::end: + whence = 2; + break; + default: + return failure; + } + + // Let's have a go + off_type result; + if (!seekoff_without_calling_python(off, way, which, result)) { + // we need to call Python + if (which == std::ios_base::out) overflow(); + if (way == std::ios_base::cur) { + if (which == std::ios_base::in) off -= egptr() - gptr(); + else if (which == std::ios_base::out) off += pptr() - pbase(); + } + py_seek(off, whence); + result = off_type(py_tell().cast()); + if (which == std::ios_base::in) underflow(); + } + return result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + virtual + pos_type seekpos(pos_type sp, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + return streambuf::seekoff(sp, std::ios_base::beg, which); + } + + private: + pybind11::object py_read, py_write, py_seek, py_tell; + + std::size_t buffer_size; + + /* This is actually a Python bytes object and the actual read buffer is + its internal data, i.e. an array of characters. + */ + pybind11::bytes read_buffer; + + /* A mere array of char's allocated on the heap at construction time and + de-allocated only at destruction time. 
+ */ + char *write_buffer; + + off_type pos_of_read_buffer_end_in_py_file, + pos_of_write_buffer_end_in_py_file; + + // the farthest place the buffer has been written into + char *farthest_pptr; + + + bool seekoff_without_calling_python( + off_type off, + std::ios_base::seekdir way, + std::ios_base::openmode which, + off_type & result) + { + // Buffer range and current position + off_type buf_begin, buf_end, buf_cur, upper_bound; + off_type pos_of_buffer_end_in_py_file; + if (which == std::ios_base::in) { + pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file; + buf_begin = reinterpret_cast(eback()); + buf_cur = reinterpret_cast(gptr()); + buf_end = reinterpret_cast(egptr()); + upper_bound = buf_end; + } + else if (which == std::ios_base::out) { + pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file; + buf_begin = reinterpret_cast(pbase()); + buf_cur = reinterpret_cast(pptr()); + buf_end = reinterpret_cast(epptr()); + farthest_pptr = std::max(farthest_pptr, pptr()); + upper_bound = reinterpret_cast(farthest_pptr) + 1; + } + else { + throw std::runtime_error( + "Control flow passes through branch that should be unreachable."); + } + + // Sought position in "buffer coordinate" + off_type buf_sought; + if (way == std::ios_base::cur) { + buf_sought = buf_cur + off; + } + else if (way == std::ios_base::beg) { + buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file); + } + else if (way == std::ios_base::end) { + return false; + } + else { + throw std::runtime_error( + "Control flow passes through branch that should be unreachable."); + } + + // if the sought position is not in the buffer, give up + if (buf_sought < buf_begin || buf_sought >= upper_bound) return false; + + // we are in wonderland + if (which == std::ios_base::in) gbump(buf_sought - buf_cur); + else if (which == std::ios_base::out) pbump(buf_sought - buf_cur); + + result = pos_of_buffer_end_in_py_file + (buf_sought - buf_end); + return true; + } + + public: + + class istream : public
std::istream + { + public: + istream(streambuf& buf) : std::istream(&buf) + { + exceptions(std::ios_base::badbit); + } + + ~istream() { if (this->good()) this->sync(); } + }; + + class ostream : public std::ostream + { + public: + ostream(streambuf& buf) : std::ostream(&buf) + { + exceptions(std::ios_base::badbit); + } + + ~ostream() { if (this->good()) this->flush(); } + }; +}; + +struct streambuf_capsule +{ + streambuf python_streambuf; + + streambuf_capsule( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + python_streambuf(python_file_obj, buffer_size) + {} +}; + +struct ostream : private streambuf_capsule, streambuf::ostream +{ + ostream( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + streambuf_capsule(python_file_obj, buffer_size), + streambuf::ostream(python_streambuf) + {} + + ~ostream() + { + if (this->good()){ + this->flush(); + } + } +}; + +struct istream : private streambuf_capsule, streambuf::istream +{ + istream( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + streambuf_capsule(python_file_obj, buffer_size), + streambuf::istream(python_streambuf) + {} + + ~istream() + { + if (this->good()) { + this->sync(); + } + } +}; + +} // namespace pystreambuf diff --git a/src/Python/pytnl/iostream_caster.h b/src/Python/pytnl/iostream_caster.h new file mode 100644 index 000000000..38f5d4e16 --- /dev/null +++ b/src/Python/pytnl/iostream_caster.h @@ -0,0 +1,59 @@ +#pragma once + +#include + +namespace pybind11 { namespace detail { + template <> struct type_caster { + public: + bool load(handle src, bool) { + if (getattr(src, "read", none()).is_none()){ + return false; + } + + obj = reinterpret_borrow(src); + value = std::unique_ptr(new pystreambuf::istream(obj, 0)); + + return true; + } + + protected: + object obj; + std::unique_ptr value; + + public: + static constexpr auto name = _("istream"); + static handle cast(const std::istream *src, return_value_policy policy, handle parent) { + return 
none().release(); + } + operator std::istream*() { return value.get(); } + operator std::istream&() { return *value; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + }; + + template <> struct type_caster { + public: + bool load(handle src, bool) { + if (getattr(src, "write", none()).is_none()){ + return false; + } + + obj = reinterpret_borrow(src); + value = std::unique_ptr(new pystreambuf::ostream(obj, 0)); + + return true; + } + + protected: + object obj; + std::unique_ptr value; + + public: + static constexpr auto name = _("ostream"); + static handle cast(const std::ostream *src, return_value_policy policy, handle parent) { + return none().release(); + } + operator std::ostream*() { return value.get(); } + operator std::ostream&() { return *value; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + }; +}} // namespace pybind11::detail diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt index 34fe9b179..dc1c3fcc3 100644 --- a/src/Python/pytnl/tnl/CMakeLists.txt +++ b/src/Python/pytnl/tnl/CMakeLists.txt @@ -7,6 +7,7 @@ set( sources Grid3D.cpp Mesh.cpp MeshReaders.cpp + MeshWriters.cpp Object.cpp SparseMatrix.cpp String.cpp diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp new file mode 100644 index 000000000..17c3c7492 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshWriters.cpp @@ -0,0 +1,88 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "MeshWriters.h" +#include "../typedefs.h" + +#include + +#include +#include + +template< typename Writer, TNL::Meshes::VTK::FileFormat default_format > +void export_MeshWriter( py::module & m, const char* name ) +{ + // We cannot use MeshReader::VariantVector for Python bindings, because its variants are + // std::vector for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t, + // std::uint32_t, std::int64_t, std::uint64_t, float and 
double. Python types do not map + // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given + // Python value fits into the C++ type when selecting the alternative for a scalar type, and + // for containers like std::vector it merely selects the first possible type. For reference, see + // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161 + using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >; + + // Binding to Writer directly is not possible, because the writer has a std::ostream attribute + // which would reference the streambuf created by the type caster from the Python file-like object. + // However, the streambuf would be destroyed as soon as the writer is constructed and control + // returned to Python, so the following invocations would use an invalid object and segfault. + // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own + // ostream attribute and is initialized by a py::object to avoid type casting.
+ using PythonWriter = PyWriter< Writer, default_format >; + py::class_< PythonWriter >( m, name ) + .def(py::init(), py::keep_alive<1, 2>(), + py::arg("stream"), py::pos_only(), py::arg("format") = default_format) + .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1) + .def("writeVertices", &Writer::template writeEntities< 0 >) + .def("writeCells", &Writer::template writeEntities<>) + // we use the VariantVector from MeshReader because we already have a caster for it + .def("writePointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + // we need a view for the std::vector + using vector_t = std::decay_t; + using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >; + view_t view( array.data(), array.size() ); + writer.writePointData( view, name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writeCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + // we need a view for the std::vector + using vector_t = std::decay_t; + using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >; + view_t view( array.data(), array.size() ); + writer.writeCellData( view, name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writeDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + // we need a view for the std::vector + using vector_t = std::decay_t; + using view_t = TNL::Containers::ArrayView< 
std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >; + view_t view( array.data(), array.size() ); + writer.writeDataArray( view, name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + ; +} + +void export_MeshWriters( py::module & m ) +{ + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfEdges" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTriangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTetrahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" ); +} diff --git a/src/Python/pytnl/tnl/MeshWriters.h b/src/Python/pytnl/tnl/MeshWriters.h new file mode 100644 index 000000000..9dd7185ea --- /dev/null +++ b/src/Python/pytnl/tnl/MeshWriters.h @@ -0,0 +1,22 @@ +#include "../iostream_caster.h" +#include + +// helper struct is needed to ensure correct initialization order in the PyWriter constructor +struct PyOstreamHelper +{ + py::object obj; + pystreambuf::ostream str; + + PyOstreamHelper( py::object src ) + : obj(py::reinterpret_borrow(src)), + str(obj) + {} +}; + +template< typename Writer, TNL::Meshes::VTK::FileFormat default_format > +struct PyWriter : public PyOstreamHelper, public Writer +{ + PyWriter( py::object src, TNL::Meshes::VTK::FileFormat format = default_format ) + : 
PyOstreamHelper(src), Writer(str) + {} +}; diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp index 2b2a852fa..65e9c14e4 100644 --- a/src/Python/pytnl/tnl/tnl.cpp +++ b/src/Python/pytnl/tnl/tnl.cpp @@ -16,6 +16,7 @@ void export_Grid3D( py::module & m ); void export_VTKTraits( py::module & m ); void export_Meshes( py::module & m ); void export_MeshReaders( py::module & m ); +void export_MeshWriters( py::module & m ); void export_SparseMatrices( py::module & m ); template< typename T > @@ -47,6 +48,7 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m) export_Meshes(m); export_MeshReaders(m); + export_MeshWriters(m); export_SparseMatrices(m); } diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h index e942db324..788a54813 100644 --- a/src/Python/pytnl/tnl_conversions.h +++ b/src/Python/pytnl/tnl_conversions.h @@ -2,3 +2,4 @@ #include "tnl_str_conversion.h" #include "tnl_tuple_conversion.h" #include "variant_caster.h" +#include "iostream_caster.h" -- GitLab From 9123ea3ff10028ce2bc1ab030e1dd3a6b0df6947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 6 Nov 2020 22:58:46 +0100 Subject: [PATCH 12/50] pytnl: added bindings for distributed mesh writers --- src/Python/pytnl/tnl_mpi/CMakeLists.txt | 1 + .../pytnl/tnl_mpi/DistributedMeshWriters.cpp | 94 +++++++++++++++++++ src/Python/pytnl/tnl_mpi/tnl_mpi.cpp | 2 + 3 files changed, 97 insertions(+) create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt index ee5e9cc32..2aa8f73da 100644 --- a/src/Python/pytnl/tnl_mpi/CMakeLists.txt +++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt @@ -4,6 +4,7 @@ set(PYBIND11_CPP_STANDARD -std=c++14) set( sources DistributedMesh.cpp DistributedMeshReaders.cpp + DistributedMeshWriters.cpp tnl_mpi.cpp ) pybind11_add_module( pytnl_mpi ${sources} ) diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp 
b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp new file mode 100644 index 000000000..4d1d18bae --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp @@ -0,0 +1,94 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../tnl/MeshWriters.h" +#include "../typedefs.h" + +#include + +#include + +template< template class WriterTemplate, typename LocalMesh, TNL::Meshes::VTK::FileFormat default_format > +void export_DistributedMeshWriter( py::module & m, const char* name ) +{ + using Writer = WriterTemplate< LocalMesh >; + using Mesh = TNL::Meshes::DistributedMeshes::DistributedMesh< LocalMesh >; + + // We cannot use MeshReader::VariantVector for Python bindings, because its variants are + // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t, + // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map + // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given + // Python value fits into the C++ type when selecting the alternative for a scalar type, and + // for containers like std::vector it merely selects the first possible type. For reference, see + // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161 + using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >; + + // Binding to Writer directly is not possible, because the writer has a std::ostream attribute + // which would reference the streambuf created by the type caster from the Python file-like object. + // However, the streambuf would be destroyed as soon as the writer is constructed and control + // returned to Python, so the following invocations would use an invalid object and segfault. + // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own + // ostream attribute and is initialized by a py::object to avoid type casting.
+ using PythonWriter = PyWriter< Writer, default_format >; + py::class_< PythonWriter >( m, name ) + .def(py::init(), py::keep_alive<1, 2>(), + py::arg("stream"), py::pos_only(), py::arg("format") = default_format) + .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1) + .def("writeVertices", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities< 0 >), + py::arg("distributedMesh")) + .def("writeVertices", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities< 0 >), + py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0) + .def("writeCells", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities<>), + py::arg("distributedMesh")) + .def("writeCells", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities<>), + py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0) + // INCONSISTENCY: the C++ methods writePPointData, writePCellData, writePDataArray do not + // take the whole array as parameter, only the ValueType as a template parameter. Since + // this does not map nicely to Python, we pass the whole array just like in the + // VTKWriter and VTUWriter classes. 
+ // we use the VariantVector from MeshReader because we already have a caster for it + .def("writePPointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePPointData< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writePCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePCellData< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writePDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePDataArray< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + // NOTE: only the overload intended for sequential writing is exported, because we don't + // have type casters for Communicators::MpiCommunicator::CommunicationGroup + // (ideally, the communication group would be compatible with the mpi4py objects) + .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ), + py::arg("mainFileName"), py::arg("subdomainIndex")) + ; +} + +void export_DistributedMeshWriters( py::module & m ) +{ + constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed; + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges, default_format >( m, 
"PVTUWriter_MeshOfEdges" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles, default_format >( m, "PVTUWriter_MeshOfTriangles" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" ); +} diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp index de2359ac2..be7813959 100644 --- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -7,6 +7,7 @@ // external functions void export_DistributedMeshes( py::module & m ); void export_DistributedMeshReaders( py::module & m ); +void export_DistributedMeshWriters( py::module & m ); #include @@ -32,6 +33,7 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m) // bindings for distributed data structures export_DistributedMeshes(m); export_DistributedMeshReaders(m); + export_DistributedMeshWriters(m); // bindings for functions using TNL::Meshes::DistributedMeshes::distributeSubentities; -- GitLab From 5a62beedbd1da99c2da0ecd41153d4ee74a034a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 7 Nov 2020 11:38:02 +0100 Subject: [PATCH 13/50] pystreambuf: fixed broken overflow() method and enabled exceptions for failbit --- src/3rdparty/cctbx/pystreambuf.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h index 6e0c497e4..d2d67730a 100644 --- a/src/3rdparty/cctbx/pystreambuf.h +++ b/src/3rdparty/cctbx/pystreambuf.h @@ -188,9 +188,8 @@ class streambuf : public std::basic_streambuf } if (!py_write.is_none()) { - // C-like string to make debugging easier + // add one extra byte for characters passed to the overflow() method write_buffer = new char[buffer_size + 1]; - write_buffer[buffer_size] = '\0'; setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5) farthest_pptr = pptr(); } @@ -255,12 +254,13 @@ class streambuf : 
public std::basic_streambuf } farthest_pptr = std::max(farthest_pptr, pptr()); off_type n_written = (off_type)(farthest_pptr - pbase()); - pybind11::bytes chunk(pbase(), n_written); - py_write(chunk); if (!traits_type::eq_int_type(c, traits_type::eof())) { - py_write(traits_type::to_char_type(c)); - n_written++; + // add the overflown character to the end of the buffer + // (we have one extra byte just for that) + write_buffer[n_written++] = traits_type::to_char_type(c); } + pybind11::bytes chunk(pbase(), n_written); + py_write(chunk); if (n_written) { pos_of_write_buffer_end_in_py_file += n_written; setp(pbase(), epptr()); @@ -450,7 +450,7 @@ class streambuf : public std::basic_streambuf public: istream(streambuf& buf) : std::istream(&buf) { - exceptions(std::ios_base::badbit); + exceptions(std::ios_base::badbit | std::ios_base::failbit); } ~istream() { if (this->good()) this->sync(); } @@ -461,7 +461,7 @@ class streambuf : public std::basic_streambuf public: ostream(streambuf& buf) : std::ostream(&buf) { - exceptions(std::ios_base::badbit); + exceptions(std::ios_base::badbit | std::ios_base::failbit); } ~ostream() { if (this->good()) this->flush(); } -- GitLab From afcc762a2f0f932051193d036b7d8a1a81122e1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 8 Nov 2020 12:09:34 +0100 Subject: [PATCH 14/50] Mesh readers: changed methods readPointData and readCellData to be virtual Also renamed PVTUReader's readLocalPointData and readLocalCellData to fit into this hierarchy, hopefully it is clear that they return only local data. 
--- src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp | 2 -- src/TNL/Meshes/Readers/MeshReader.h | 12 ++++++++++++ src/TNL/Meshes/Readers/PVTUReader.h | 8 ++++---- src/TNL/Meshes/Readers/XMLVTK.h | 8 ++++---- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp index 7847e340b..e972eb65e 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp @@ -20,7 +20,5 @@ void export_DistributedMeshReaders( py::module & m ) .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >) .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >) .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >) - .def("readLocalPointData", &PVTUReader::readLocalPointData) - .def("readLocalCellData", &PVTUReader::readLocalCellData) ; } diff --git a/src/TNL/Meshes/Readers/MeshReader.h b/src/TNL/Meshes/Readers/MeshReader.h index 88e2986ba..8bf8189ba 100644 --- a/src/TNL/Meshes/Readers/MeshReader.h +++ b/src/TNL/Meshes/Readers/MeshReader.h @@ -150,6 +150,18 @@ public: throw MeshReaderError( "VTKReader", "MeshBuilder failed" ); } + virtual VariantVector + readPointData( std::string arrayName ) + { + throw Exceptions::NotImplementedError( "readPointData is not implemented in the mesh reader for this specific file format." ); + } + // Default implementation: always throws NotImplementedError; the PVTU and XML VTK readers in this patch override it. + virtual VariantVector + readCellData( std::string arrayName ) + { + throw Exceptions::NotImplementedError( "readCellData is not implemented in the mesh reader for this specific file format."
); + } + std::string getMeshType() const { diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h index 4bb8ba7eb..393ee1551 100644 --- a/src/TNL/Meshes/Readers/PVTUReader.h +++ b/src/TNL/Meshes/Readers/PVTUReader.h @@ -211,14 +211,14 @@ public: mesh.setCommunicationGroup( group ); } - VariantVector - readLocalPointData( std::string arrayName ) + virtual VariantVector + readPointData( std::string arrayName ) override { return localReader.readPointData( arrayName ); } - VariantVector - readLocalCellData( std::string arrayName ) + virtual VariantVector + readCellData( std::string arrayName ) override { return localReader.readCellData( arrayName ); } diff --git a/src/TNL/Meshes/Readers/XMLVTK.h b/src/TNL/Meshes/Readers/XMLVTK.h index fb8e1eb40..af864e6e9 100644 --- a/src/TNL/Meshes/Readers/XMLVTK.h +++ b/src/TNL/Meshes/Readers/XMLVTK.h @@ -325,8 +325,8 @@ public: #endif } - VariantVector - readPointData( std::string arrayName ) + virtual VariantVector + readPointData( std::string arrayName ) override { #ifdef HAVE_TINYXML2 return readPointOrCellData( "PointData", arrayName ); @@ -335,8 +335,8 @@ public: #endif } - VariantVector - readCellData( std::string arrayName ) + virtual VariantVector + readCellData( std::string arrayName ) override { #ifdef HAVE_TINYXML2 return readPointOrCellData( "CellData", arrayName ); -- GitLab From 7734ee3c56083c9fb8e3a48dd42cce32abd33165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 8 Nov 2020 12:32:39 +0100 Subject: [PATCH 15/50] Added function getMeshReader It is useful especially when one wants to load mesh functions via the readPointData or readCellData methods when the mesh was already loaded. 
--- src/TNL/Meshes/Readers/getMeshReader.h | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/TNL/Meshes/Readers/getMeshReader.h diff --git a/src/TNL/Meshes/Readers/getMeshReader.h b/src/TNL/Meshes/Readers/getMeshReader.h new file mode 100644 index 000000000..2c2c18a8e --- /dev/null +++ b/src/TNL/Meshes/Readers/getMeshReader.h @@ -0,0 +1,58 @@ +/*************************************************************************** + getMeshReader.h - description + ------------------- + begin : Nov 7, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +#include + +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { +namespace Readers { + +std::shared_ptr< Readers::MeshReader > +getMeshReader( const std::string& fileName, + const std::string& fileFormat ) +{ + namespace fs = std::experimental::filesystem; + std::string format = fileFormat; + if( format == "auto" ) { + format = fs::path(fileName).extension(); + if( format.length() > 0 ) + // remove dot from the extension + format = format.substr(1); + } + + if( format == "ng" ) + return std::make_shared< Readers::NetgenReader >( fileName ); + else if( format == "vtk" ) + return std::make_shared< Readers::VTKReader >( fileName ); + else if( format == "vtu" ) + return std::make_shared< Readers::VTUReader >( fileName ); + else if( format == "pvtu" ) + return std::make_shared< Readers::PVTUReader >( fileName ); + + if( fileFormat == "auto" ) + std::cerr << "File '" << fileName << "' has unsupported format (based on the file extension): " << format << "."; + else + std::cerr << "Unsupported fileFormat parameter: " << fileFormat << "."; + std::cerr << " Supported formats are 'vtk', 'vtu', 'pvtu' and 'ng'." 
<< std::endl; + return nullptr; +} + +} // namespace Readers +} // namespace Meshes +} // namespace TNL -- GitLab From 0f95798ee8f044a904607b0c6d876230a950a46a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 8 Nov 2020 22:13:09 +0100 Subject: [PATCH 16/50] Added parameter --redirect-mpi-output-dir to MpiCommunicator --- src/TNL/Communicators/MpiCommunicator.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index dd119e813..18143cce0 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -73,6 +73,7 @@ class MpiCommunicator { #ifdef HAVE_MPI config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true ); + config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." ); config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false ); config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 ); #endif @@ -85,8 +86,9 @@ class MpiCommunicator if(IsInitialized())//i.e. 
- isUsed { const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); + const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" ); if( redirect ) - setupRedirection(); + setupRedirection( outputDirectory ); #ifdef HAVE_CUDA int size; MPI_Comm_size( MPI_COMM_WORLD, &size ); @@ -152,15 +154,15 @@ class MpiCommunicator (void) NullRequest; } - static void setupRedirection() + static void setupRedirection( std::string outputDirectory ) { #ifdef HAVE_MPI if(isDistributed() ) { if(GetRank(AllGroup)!=0) { - const std::string stdoutFile = std::string("./stdout_") + std::to_string(GetRank(AllGroup)) + ".txt"; - const std::string stderrFile = std::string("./stderr_") + std::to_string(GetRank(AllGroup)) + ".txt"; + const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank(AllGroup)) + ".txt"; + const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank(AllGroup)) + ".txt"; std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl; Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); } -- GitLab From 0052f917e114ebc2f75afa1d23ad7b433f4f3a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 10 Nov 2020 20:03:19 +0100 Subject: [PATCH 17/50] pytnl: updated bindings for Mesh, added missing methods --- src/Python/pytnl/tnl/EntityTypes.h | 36 ----------------------------- src/Python/pytnl/tnl/Grid.h | 16 ++++++------- src/Python/pytnl/tnl/Mesh.h | 36 ++++++++++++++++++----------- src/Python/pytnl/tnl/mesh_getters.h | 36 +++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 57 deletions(-) delete mode 100644 src/Python/pytnl/tnl/EntityTypes.h create mode 100644 src/Python/pytnl/tnl/mesh_getters.h diff --git a/src/Python/pytnl/tnl/EntityTypes.h b/src/Python/pytnl/tnl/EntityTypes.h deleted file mode 100644 index 1f10e2827..000000000 --- 
a/src/Python/pytnl/tnl/EntityTypes.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -namespace py = pybind11; - -enum class EntityTypes { Cell, Face, Vertex }; - -inline void -export_EntityTypes( py::module & m ) -{ - // avoid duplicate conversion -> export only once - static bool exported = false; - if( ! exported ) { - // TODO: should be nested types instead - py::enum_< EntityTypes >( m, "EntityTypes" ) - .value("Cell", EntityTypes::Cell) - .value("Face", EntityTypes::Face) - .value("Vertex", EntityTypes::Vertex) - ; - exported = true; - } -} - -template< typename Mesh > -typename Mesh::GlobalIndexType -mesh_getEntitiesCount( const Mesh & self, const EntityTypes & entity ) -{ - if( entity == EntityTypes::Cell ) - return self.template getEntitiesCount< typename Mesh::Cell >(); - else if( entity == EntityTypes::Face ) - return self.template getEntitiesCount< typename Mesh::Face >(); - else if( entity == EntityTypes::Vertex ) - return self.template getEntitiesCount< typename Mesh::Vertex >(); - else - throw py::value_error("The entity parameter must be either Cell, Face or Vertex."); -} diff --git a/src/Python/pytnl/tnl/Grid.h b/src/Python/pytnl/tnl/Grid.h index 8cf28a8f5..2622bd5c9 100644 --- a/src/Python/pytnl/tnl/Grid.h +++ b/src/Python/pytnl/tnl/Grid.h @@ -5,7 +5,7 @@ namespace py = pybind11; #include "StaticVector.h" #include "Grid_getSpaceStepsProducts.h" -#include "EntityTypes.h" +#include "mesh_getters.h" #include @@ -54,8 +54,6 @@ void export_Grid( py::module & m, const char* name ) // void (Grid::* _setDimensions1)(const IndexType) = &Grid::setDimensions; void (Grid::* _setDimensions2)(const typename Grid::CoordinatesType &) = &Grid::setDimensions; - export_EntityTypes(m); - auto grid = py::class_( m, name ) .def(py::init<>()) .def_static("getMeshDimension", &Grid::getMeshDimension) @@ -68,11 +66,13 @@ void export_Grid( py::module & m, const char* name ) .def("setDomain", &Grid::setDomain) .def("getOrigin", &Grid::getOrigin, 
py::return_value_policy::reference_internal) .def("getProportions", &Grid::getProportions, py::return_value_policy::reference_internal) - .def("getEntitiesCount", &mesh_getEntitiesCount< Grid >) - // TODO: if combined, the return type would depend on the runtime parameter (entity) - .def("getEntity_cell", &Grid::template getEntity) - .def("getEntity_face", &Grid::template getEntity) - .def("getEntity_vertex", &Grid::template getEntity) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Cell >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Face >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Vertex >) + // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity) + .def("getCell", &Grid::template getEntity) + .def("getFace", &Grid::template getEntity) + .def("getVertex", &Grid::template getEntity) .def("getEntityIndex", &Grid::template getEntityIndex) .def("getEntityIndex", &Grid::template getEntityIndex) .def("getEntityIndex", &Grid::template getEntityIndex) diff --git a/src/Python/pytnl/tnl/Mesh.h b/src/Python/pytnl/tnl/Mesh.h index 21fa015fc..3097f111f 100644 --- a/src/Python/pytnl/tnl/Mesh.h +++ b/src/Python/pytnl/tnl/Mesh.h @@ -5,7 +5,7 @@ namespace py = pybind11; #include "../typedefs.h" #include "StaticVector.h" -#include "EntityTypes.h" +#include "mesh_getters.h" #include #include @@ -82,8 +82,11 @@ template< typename MeshEntity, typename Scope > void export_MeshEntity( Scope & scope, const char* name ) { auto entity = py::class_< MeshEntity >( scope, name ) +// .def(py::init<>()) +// .def(py::init()) .def_static("getEntityDimension", &MeshEntity::getEntityDimension) .def("getIndex", &MeshEntity::getIndex) + .def("getTag", &MeshEntity::getTag) // TODO ; @@ -95,23 +98,24 @@ void export_MeshEntity( Scope & scope, const char* name ) template< typename Mesh > void export_Mesh( py::module & m, const char* name ) { - // there are two templates - const 
and non-const - take only the const - auto (Mesh::* getEntity_cell)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - auto (Mesh::* getEntity_face)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - auto (Mesh::* getEntity_vertex)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - - export_EntityTypes(m); - auto mesh = py::class_< Mesh, TNL::Object >( m, name ) .def(py::init<>()) .def_static("getMeshDimension", &Mesh::getMeshDimension) .def_static("getSerializationType", &Mesh::getSerializationType) .def("getSerializationTypeVirtual", &Mesh::getSerializationTypeVirtual) - .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh >) - // TODO: if combined, the return type would depend on the runtime parameter (entity) - .def("getEntity_cell", getEntity_cell) - .def("getEntity_face", getEntity_face) - .def("getEntity_vertex", getEntity_vertex) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Cell >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Face >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Vertex >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Cell >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Face >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Vertex >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Cell >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Face >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Vertex >) + // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity) + .def("getCell", &Mesh::template getEntity) + .def("getFace", &Mesh::template getEntity) + .def("getVertex", &Mesh::template getEntity) .def("getEntityCenter", []( const Mesh& mesh, 
const typename Mesh::Cell& cell ){ return getEntityCenter( mesh, cell ); } ) .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Face& face ){ return getEntityCenter( mesh, face ); } ) .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ return getEntityCenter( mesh, vertex ); } ) @@ -124,6 +128,12 @@ void export_Mesh( py::module & m, const char* name ) return mesh.template isBoundaryEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } ) .def("isBoundaryEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ return mesh.template isBoundaryEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Cell& cell ){ + return mesh.template isGhostEntity< Mesh::Cell::getEntityDimension() >( cell.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Face& face ){ + return mesh.template isGhostEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ + return mesh.template isGhostEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } ) // TODO: more? 
; diff --git a/src/Python/pytnl/tnl/mesh_getters.h b/src/Python/pytnl/tnl/mesh_getters.h new file mode 100644 index 000000000..c5eddaa5e --- /dev/null +++ b/src/Python/pytnl/tnl/mesh_getters.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getEntitiesCount( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getEntitiesCount< EntityType::getEntityDimension() >(); +} + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getGhostEntitiesCount( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getGhostEntitiesCount< EntityType::getEntityDimension() >(); +} + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getGhostEntitiesOffset( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getGhostEntitiesOffset< EntityType::getEntityDimension() >(); +} -- GitLab From 40cd3071254dd505d3406423d6085348c4ccdfdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 14 Nov 2020 11:43:02 +0100 Subject: [PATCH 18/50] Refactored BiCGStab, added restarting --- src/TNL/Solvers/Linear/BICGStab.h | 4 + src/TNL/Solvers/Linear/BICGStab_impl.h | 138 +++++++++++++------------ 2 files 
changed, 74 insertions(+), 68 deletions(-) diff --git a/src/TNL/Solvers/Linear/BICGStab.h b/src/TNL/Solvers/Linear/BICGStab.h index 2cede824a..474a45d02 100644 --- a/src/TNL/Solvers/Linear/BICGStab.h +++ b/src/TNL/Solvers/Linear/BICGStab.h @@ -37,6 +37,10 @@ public: bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: + void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); + + void preconditioned_matvec( ConstVectorViewType src, VectorViewType dst ); + void setSize( const VectorViewType& x ); bool exact_residue = false; diff --git a/src/TNL/Solvers/Linear/BICGStab_impl.h b/src/TNL/Solvers/Linear/BICGStab_impl.h index baa4b6363..ff3b42ed0 100644 --- a/src/TNL/Solvers/Linear/BICGStab_impl.h +++ b/src/TNL/Solvers/Linear/BICGStab_impl.h @@ -38,111 +38,80 @@ setup( const Config::ParameterContainer& parameters, } template< typename Matrix > -bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x ) +bool +BICGStab< Matrix >:: +solve( ConstVectorViewType b, VectorViewType x ) { this->setSize( x ); - RealType alpha, beta, omega, aux, rho, rho_old, b_norm; + RealType alpha, beta, omega, rho, rho_old, b_norm, r_ast_sqnorm; + // initialize the norm of the preconditioned right-hand-side if( this->preconditioner ) { this->preconditioner->solve( b, M_tmp ); b_norm = lpNorm( M_tmp, 2.0 ); - - this->matrix->vectorProduct( x, M_tmp ); - M_tmp = b - M_tmp; - this->preconditioner->solve( M_tmp, r ); } - else { + else b_norm = lpNorm( b, 2.0 ); - this->matrix->vectorProduct( x, r ); - r = b - r; - } + if( b_norm == 0.0 ) + b_norm = 1.0; + + // r = M.solve(b - A * x); + compute_residue( r, x, b ); p = r_ast = r; s.setValue( 0.0 ); - rho = (r, r_ast); + r_ast_sqnorm = rho = (r, r_ast); - if( b_norm == 0.0 ) - b_norm = 1.0; + const RealType eps2 = std::numeric_limits::epsilon() * std::numeric_limits::epsilon(); this->resetIterations(); this->setResidue( std::sqrt( rho ) / b_norm ); while( this->nextIteration() ) { - 
/**** - * alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 ) - */ - if( this->preconditioner ) { - this->matrix->vectorProduct( p, M_tmp ); - this->preconditioner->solve( M_tmp, Ap ); - } - else { - this->matrix->vectorProduct( p, Ap ); - } - aux = (Ap, r_ast); - alpha = rho / aux; + // alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 ) + preconditioned_matvec( p, Ap ); + alpha = rho / (Ap, r_ast); - /**** - * s_j = r_j - alpha_j * A p_j - */ + // s_j = r_j - alpha_j * A p_j s = r - alpha * Ap; - /**** - * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j ) - */ - if( this->preconditioner ) { - this->matrix->vectorProduct( s, M_tmp ); - this->preconditioner->solve( M_tmp, As ); - } - else { - this->matrix->vectorProduct( s, As ); - } - aux = lpNorm( As, 2.0 ); - omega = (As, s) / (aux * aux); + // omega_j = ( A s_j, s_j ) / ( A s_j, A s_j ) + preconditioned_matvec( s, As ); + omega = (As, s) / (As, As); - /**** - * x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j - */ + // x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j x += alpha * p + omega * s; - /**** - * r_{j+1} = s_j - omega_j * A s_j - */ + // r_{j+1} = s_j - omega_j * A s_j r = s - omega * As; - /**** - * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 ) - */ + // compute scalar product of the residual vectors rho_old = rho; rho = (r, r_ast); + if( abs(rho) < eps2 * r_ast_sqnorm ) { + // The new residual vector has become too orthogonal to the arbitrarily chosen direction r_ast. + // Let's restart with a new r0: + compute_residue( r, x, b ); + r_ast = r; + r_ast_sqnorm = rho = (r, r_ast); + } + + // beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 ) beta = (rho / rho_old) * (alpha / omega); - /**** - * p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j ) - */ + // p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j ) p = r + beta * p - (beta * omega) * Ap; if( exact_residue ) { - /**** - * Compute the exact preconditioned residue into the 's' vector. 
- */ - if( this->preconditioner ) { - this->matrix->vectorProduct( x, M_tmp ); - M_tmp = b - M_tmp; - this->preconditioner->solve( M_tmp, s ); - } - else { - this->matrix->vectorProduct( x, s ); - s = b - s; - } + // Compute the exact preconditioned residue into the 's' vector. + compute_residue( s, x, b ); const RealType residue = lpNorm( s, 2.0 ); this->setResidue( residue / b_norm ); } else { - /**** - * Use the "orthogonal residue vector" for stopping. - */ + // Use the "orthogonal residue vector" for stopping. const RealType residue = lpNorm( r, 2.0 ); this->setResidue( residue / b_norm ); } @@ -153,7 +122,40 @@ bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x ) } template< typename Matrix > -void BICGStab< Matrix > :: setSize( const VectorViewType& x ) +void +BICGStab< Matrix >:: +compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ) +{ + // r = M.solve(b - A * x); + if( this->preconditioner ) { + this->matrix->vectorProduct( x, M_tmp ); + M_tmp = b - M_tmp; + this->preconditioner->solve( M_tmp, r ); + } + else { + this->matrix->vectorProduct( x, r ); + r = b - r; + } +} + +template< typename Matrix > +void +BICGStab< Matrix >:: +preconditioned_matvec( ConstVectorViewType src, VectorViewType dst ) +{ + if( this->preconditioner ) { + this->matrix->vectorProduct( src, M_tmp ); + this->preconditioner->solve( M_tmp, dst ); + } + else { + this->matrix->vectorProduct( src, dst ); + } +} + +template< typename Matrix > +void +BICGStab< Matrix >:: +setSize( const VectorViewType& x ) { r.setLike( x ); r_ast.setLike( x ); -- GitLab From 4cf3545497fd6321351c8bfd4404d17fa69c79e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 16 Nov 2020 22:44:27 +0100 Subject: [PATCH 19/50] Refactoring DistributedArray - implementation via DistributedArrayView as a data member --- src/TNL/Containers/DistributedArray.h | 4 +- src/TNL/Containers/DistributedArray.hpp | 83 ++++++--------------- 
src/TNL/Containers/DistributedArrayView.h | 9 ++- src/TNL/Containers/DistributedArrayView.hpp | 18 +++++ 4 files changed, 46 insertions(+), 68 deletions(-) diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h index 66dd8a8f0..31fc6d8a8 100644 --- a/src/TNL/Containers/DistributedArray.h +++ b/src/TNL/Containers/DistributedArray.h @@ -168,9 +168,7 @@ public: // TODO: serialization (save, load) protected: - LocalRangeType localRange; - IndexType globalSize = 0; - CommunicationGroup group = Communicator::NullGroup; + ViewType view; LocalArrayType localData; }; diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp index c146bbf9f..4910cbcd7 100644 --- a/src/TNL/Containers/DistributedArray.hpp +++ b/src/TNL/Containers/DistributedArray.hpp @@ -39,11 +39,9 @@ DistributedArray< Value, Device, Index, Communicator >:: setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group ) { TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" ); - this->localRange = localRange; - this->globalSize = globalSize; - this->group = group; if( group != Communicator::NullGroup ) localData.setSize( localRange.getSize() ); + view.bind( localRange, globalSize, group, localData.getView() ); } template< typename Value, @@ -54,7 +52,7 @@ const Subrange< Index >& DistributedArray< Value, Device, Index, Communicator >:: getLocalRange() const { - return localRange; + return view.getLocalRange(); } template< typename Value, @@ -65,7 +63,7 @@ typename Communicator::CommunicationGroup DistributedArray< Value, Device, Index, Communicator >:: getCommunicationGroup() const { - return group; + return view.getCommunicationGroup(); } template< typename Value, @@ -99,18 +97,7 @@ void DistributedArray< Value, Device, Index, Communicator >:: copyFromGlobal( ConstLocalViewType globalArray ) { - TNL_ASSERT_EQ( getSize(), globalArray.getSize(), - "given 
global array has different size than the distributed array" ); - - LocalViewType localView( localData ); - const LocalRangeType localRange = getLocalRange(); - - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - localView[ i ] = globalArray[ localRange.getGlobalIndex( i ) ]; - }; - - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel ); + view.copyFromGlobal( globalArray ); } @@ -126,7 +113,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::ViewType DistributedArray< Value, Device, Index, Communicator >:: getView() { - return ViewType( getLocalRange(), getSize(), getCommunicationGroup(), getLocalView() ); + return view; } template< typename Value, @@ -137,7 +124,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType DistributedArray< Value, Device, Index, Communicator >:: getConstView() const { - return ConstViewType( getLocalRange(), getSize(), getCommunicationGroup(), getConstLocalView() ); + return view.getConstView(); } template< typename Value, @@ -169,10 +156,8 @@ void DistributedArray< Value, Device, Index, Communicator >:: setLike( const Array& array ) { - localRange = array.getLocalRange(); - globalSize = array.getSize(); - group = array.getCommunicationGroup(); localData.setLike( array.getConstLocalView() ); + view.bind( array.getLocalRange(), array.getSize(), array.getCommunicationGroup(), localData.getView() ); } template< typename Value, @@ -183,9 +168,7 @@ void DistributedArray< Value, Device, Index, Communicator >:: reset() { - localRange.reset(); - globalSize = 0; - group = Communicator::NullGroup; + view.reset(); localData.reset(); } @@ -197,7 +180,7 @@ bool DistributedArray< Value, Device, Index, Communicator >:: empty() const { - return getSize() == 0; + return view.empty(); } template< typename Value, @@ -208,7 +191,7 @@ Index DistributedArray< Value, Device, Index, Communicator >:: getSize() const { - return globalSize; + return view.getSize(); 
} template< typename Value, @@ -219,7 +202,7 @@ void DistributedArray< Value, Device, Index, Communicator >:: setValue( ValueType value ) { - localData.setValue( value ); + view.setValue( value ); } template< typename Value, @@ -230,8 +213,7 @@ void DistributedArray< Value, Device, Index, Communicator >:: setElement( IndexType i, ValueType value ) { - const IndexType li = localRange.getLocalIndex( i ); - localData.setElement( li, value ); + view.setElement( i, value ); } template< typename Value, @@ -242,8 +224,7 @@ Value DistributedArray< Value, Device, Index, Communicator >:: getElement( IndexType i ) const { - const IndexType li = localRange.getLocalIndex( i ); - return localData.getElement( li ); + return view.getElement( i ); } template< typename Value, @@ -255,8 +236,7 @@ Value& DistributedArray< Value, Device, Index, Communicator >:: operator[]( IndexType i ) { - const IndexType li = localRange.getLocalIndex( i ); - return localData[ li ]; + return view[ i ]; } template< typename Value, @@ -268,8 +248,7 @@ const Value& DistributedArray< Value, Device, Index, Communicator >:: operator[]( IndexType i ) const { - const IndexType li = localRange.getLocalIndex( i ); - return localData[ li ]; + return view[ i ]; } template< typename Value, @@ -281,7 +260,7 @@ DistributedArray< Value, Device, Index, Communicator >:: operator=( const DistributedArray& array ) { setLike( array ); - localData = array.getConstLocalView(); + view = array; return *this; } @@ -295,7 +274,7 @@ DistributedArray< Value, Device, Index, Communicator >:: operator=( const Array& array ) { setLike( array ); - localData = array.getConstLocalView(); + view = array; return *this; } @@ -308,17 +287,7 @@ bool DistributedArray< Value, Device, Index, Communicator >:: operator==( const Array& array ) const { - // we can't run allreduce if the communication groups are different - if( group != array.getCommunicationGroup() ) - return false; - const bool localResult = - localRange == array.getLocalRange() 
&& - globalSize == array.getSize() && - localData == array.getConstLocalView(); - bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); - return result; + return view == array; } template< typename Value, @@ -330,7 +299,7 @@ bool DistributedArray< Value, Device, Index, Communicator >:: operator!=( const Array& array ) const { - return ! (*this == array); + return view != array; } template< typename Value, @@ -341,12 +310,7 @@ bool DistributedArray< Value, Device, Index, Communicator >:: containsValue( ValueType value ) const { - bool result = false; - if( group != CommunicatorType::NullGroup ) { - const bool localResult = localData.containsValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group ); - } - return result; + return view.containsValue( value ); } template< typename Value, @@ -357,12 +321,7 @@ bool DistributedArray< Value, Device, Index, Communicator >:: containsOnlyValue( ValueType value ) const { - bool result = true; - if( group != CommunicatorType::NullGroup ) { - const bool localResult = localData.containsOnlyValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); - } - return result; + return view.containsOnlyValue( value ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index e17467bef..345d4c13c 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -74,9 +74,12 @@ public: __cuda_callable__ DistributedArrayView( DistributedArrayView&& ) = default; - // method for rebinding (reinitialization) - // Note that you can also bind directly to Array and other types implicitly - // convertible to ArrayView. 
+ // method for rebinding (reinitialization) to raw data + __cuda_callable__ + void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ); + + // Note that you can also bind directly to DistributedArray and other types implicitly + // convertible to DistributedArrayView. __cuda_callable__ void bind( DistributedArrayView view ); diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 0199229d4..3890764a2 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -31,6 +31,24 @@ DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communi localData( view.getConstLocalView() ) {} +template< typename Value, + typename Device, + typename Index, + typename Communicator > +__cuda_callable__ +void +DistributedArrayView< Value, Device, Index, Communicator >:: +bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) +{ + TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(), + "The local array size does not match the local range of the distributed array." 
); + + this->localRange = localRange; + this->globalSize = globalSize; + this->group = group; + this->localData.bind( localData ); +} + template< typename Value, typename Device, typename Index, -- GitLab From 87772050729688183efcbb1e04c903fa84dda6ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 18 Nov 2020 22:42:44 +0100 Subject: [PATCH 20/50] Cleaned up the implementation of DistributedVector --- src/TNL/Containers/DistributedVector.hpp | 76 ++++-------------------- 1 file changed, 13 insertions(+), 63 deletions(-) diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index fa49591e8..b2c7de038 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -48,7 +48,7 @@ typename DistributedVector< Value, Device, Index, Communicator >::ViewType DistributedVector< Value, Device, Index, Communicator >:: getView() { - return ViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getLocalView() ); + return BaseType::getView(); } template< typename Value, @@ -59,7 +59,7 @@ typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType DistributedVector< Value, Device, Index, Communicator >:: getConstView() const { - return ConstViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getConstLocalView() ); + return BaseType::getConstView(); } template< typename Value, @@ -97,9 +97,7 @@ DistributedVector< Real, Device, Index, Communicator >:: operator=( const Vector& vector ) { this->setLike( vector ); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = vector.getConstLocalView(); - } + getView() = vector; return *this; } @@ -112,16 +110,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator+=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), 
- "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." ); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += vector.getConstLocalView(); - } + getView() += vector; return *this; } @@ -134,16 +123,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator-=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." ); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= vector.getConstLocalView(); - } + getView() -= vector; return *this; } @@ -156,16 +136,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator*=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." 
); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() *= vector.getConstLocalView(); - } + getView() *= vector; return *this; } @@ -178,16 +149,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator/=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." ); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= vector.getConstLocalView(); - } + getView() /= vector; return *this; } @@ -200,9 +162,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = c; - } + getView() = c; return *this; } @@ -215,9 +175,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator+=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += c; - } + getView() += c; return *this; } @@ -230,9 +188,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator-=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= c; - } + getView() -= c; return *this; } @@ -245,9 +201,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator*=( Scalar c ) { - if( this->getCommunicationGroup() != 
CommunicatorType::NullGroup ) { - getLocalView() *= c; - } + getView() *= c; return *this; } @@ -260,9 +214,7 @@ DistributedVector< Real, Device, Index, Communicator >& DistributedVector< Real, Device, Index, Communicator >:: operator/=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= c; - } + getView() /= c; return *this; } @@ -275,9 +227,7 @@ void DistributedVector< Real, Device, Index, Communicator >:: scan( IndexType begin, IndexType end ) { - if( end == 0 ) - end = this->getSize(); - Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); + getView().template scan< Type >( begin, end ); } } // namespace Containers -- GitLab From 977f08fd46a35aefa7ab5d8368e06ae273c28287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 17 Nov 2020 10:49:47 +0100 Subject: [PATCH 21/50] Removed __cuda_callable__ from methods in DistributedArrayView, DistributedVectorView and DistributedMatrix Distributed data structures are not supposed to be passed to device kernels. Distributed data structures are operated by the host, which uses the device for parallel processing only in the local data structures. 
--- src/TNL/Containers/DistributedArrayView.h | 13 +------------ src/TNL/Containers/DistributedArrayView.hpp | 5 ----- src/TNL/Containers/DistributedVectorView.h | 4 ---- src/TNL/Containers/DistributedVectorView.hpp | 2 -- src/TNL/Matrices/DistributedMatrix.h | 9 --------- src/TNL/Matrices/DistributedMatrix_impl.h | 9 --------- src/TNL/Solvers/Linear/GMRES_impl.h | 8 ++++---- .../Solvers/Linear/Preconditioners/Diagonal_impl.h | 9 ++++++--- 8 files changed, 11 insertions(+), 48 deletions(-) diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index 345d4c13c..1936c8d58 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -48,7 +48,6 @@ public: // Initialization by raw data - __cuda_callable__ DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) : localRange(localRange), globalSize(globalSize), group(group), localData(localData) { @@ -56,31 +55,23 @@ public: "The local array size does not match the local range of the distributed array." ); } - __cuda_callable__ DistributedArrayView() = default; - // Copy-constructor does shallow copy, so views can be passed-by-value into - // CUDA kernels and they can be captured-by-value in __cuda_callable__ - // lambda functions. - __cuda_callable__ + // Copy-constructor does shallow copy. 
DistributedArrayView( const DistributedArrayView& ) = default; // "Templated copy-constructor" accepting any cv-qualification of Value template< typename Value_ > - __cuda_callable__ DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& ); // default move-constructor - __cuda_callable__ DistributedArrayView( DistributedArrayView&& ) = default; // method for rebinding (reinitialization) to raw data - __cuda_callable__ void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ); // Note that you can also bind directly to DistributedArray and other types implicitly // convertible to DistributedArrayView. - __cuda_callable__ void bind( DistributedArrayView view ); // binding to local array via raw pointer @@ -91,13 +82,11 @@ public: /** * \brief Returns a modifiable view of the array view. */ - __cuda_callable__ ViewType getView(); /** * \brief Returns a non-modifiable view of the array view. */ - __cuda_callable__ ConstViewType getConstView() const; diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 3890764a2..0a206054c 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -22,7 +22,6 @@ template< typename Value, typename Index, typename Communicator > template< typename Value_ > -__cuda_callable__ DistributedArrayView< Value, Device, Index, Communicator >:: DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view ) : localRange( view.getLocalRange() ), @@ -35,7 +34,6 @@ template< typename Value, typename Device, typename Index, typename Communicator > -__cuda_callable__ void DistributedArrayView< Value, Device, Index, Communicator >:: bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) @@ -53,7 +51,6 @@ template< typename Value, typename Device, typename Index, 
typename Communicator > -__cuda_callable__ void DistributedArrayView< Value, Device, Index, Communicator >:: bind( DistributedArrayView view ) @@ -82,7 +79,6 @@ template< typename Value, typename Device, typename Index, typename Communicator > -__cuda_callable__ typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType DistributedArrayView< Value, Device, Index, Communicator >:: getView() @@ -94,7 +90,6 @@ template< typename Value, typename Device, typename Index, typename Communicator > -__cuda_callable__ typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType DistributedArrayView< Value, Device, Index, Communicator >:: getConstView() const diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 157a64b94..cb46f59c3 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -58,12 +58,10 @@ public: // In C++14, default constructors cannot be inherited, although Clang // and GCC since version 7.0 inherit them. // https://stackoverflow.com/a/51854172 - __cuda_callable__ DistributedVectorView() = default; // initialization by base class is not a copy constructor so it has to be explicit template< typename Real_ > // template catches both const and non-const qualified Element - __cuda_callable__ DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view ) : BaseType( view ) {} @@ -74,13 +72,11 @@ public: /** * \brief Returns a modifiable view of the array view. */ - __cuda_callable__ ViewType getView(); /** * \brief Returns a non-modifiable view of the array view. 
*/ - __cuda_callable__ ConstViewType getConstView() const; /* diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 70f61979f..0e32343a4 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -44,7 +44,6 @@ template< typename Value, typename Device, typename Index, typename Communicator > -__cuda_callable__ typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType DistributedVectorView< Value, Device, Index, Communicator >:: getView() @@ -56,7 +55,6 @@ template< typename Value, typename Device, typename Index, typename Communicator > -__cuda_callable__ typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType DistributedVectorView< Value, Device, Index, Communicator >:: getConstView() const diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h index faa220da6..5731d11ca 100644 --- a/src/TNL/Matrices/DistributedMatrix.h +++ b/src/TNL/Matrices/DistributedMatrix.h @@ -72,16 +72,12 @@ public: void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup ); - __cuda_callable__ const LocalRangeType& getLocalRowRange() const; - __cuda_callable__ CommunicationGroup getCommunicationGroup() const; - __cuda_callable__ const Matrix& getLocalMatrix() const; - __cuda_callable__ Matrix& getLocalMatrix(); @@ -99,10 +95,8 @@ public: void reset(); - __cuda_callable__ IndexType getRows() const; - __cuda_callable__ IndexType getColumns() const; template< typename RowCapacitiesVector > @@ -120,14 +114,11 @@ public: RealType getElement( IndexType row, IndexType column ) const; - __cuda_callable__ RealType getElementFast( IndexType row, IndexType column ) const; - __cuda_callable__ MatrixRow getRow( IndexType row ); - __cuda_callable__ ConstMatrixRow getRow( IndexType row ) const; // multiplication with a global vector 
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h index 806703ca6..40a675f1a 100644 --- a/src/TNL/Matrices/DistributedMatrix_impl.h +++ b/src/TNL/Matrices/DistributedMatrix_impl.h @@ -42,7 +42,6 @@ setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns template< typename Matrix, typename Communicator > -__cuda_callable__ const Containers::Subrange< typename Matrix::IndexType >& DistributedMatrix< Matrix, Communicator >:: getLocalRowRange() const @@ -52,7 +51,6 @@ getLocalRowRange() const template< typename Matrix, typename Communicator > -__cuda_callable__ typename Communicator::CommunicationGroup DistributedMatrix< Matrix, Communicator >:: getCommunicationGroup() const @@ -62,7 +60,6 @@ getCommunicationGroup() const template< typename Matrix, typename Communicator > -__cuda_callable__ const Matrix& DistributedMatrix< Matrix, Communicator >:: getLocalMatrix() const @@ -72,7 +69,6 @@ getLocalMatrix() const template< typename Matrix, typename Communicator > -__cuda_callable__ Matrix& DistributedMatrix< Matrix, Communicator >:: getLocalMatrix() @@ -139,7 +135,6 @@ reset() template< typename Matrix, typename Communicator > -__cuda_callable__ typename Matrix::IndexType DistributedMatrix< Matrix, Communicator >:: getRows() const @@ -149,7 +144,6 @@ getRows() const template< typename Matrix, typename Communicator > -__cuda_callable__ typename Matrix::IndexType DistributedMatrix< Matrix, Communicator >:: getColumns() const @@ -224,7 +218,6 @@ getElement( IndexType row, template< typename Matrix, typename Communicator > -__cuda_callable__ typename Matrix::RealType DistributedMatrix< Matrix, Communicator >:: getElementFast( IndexType row, @@ -236,7 +229,6 @@ getElementFast( IndexType row, template< typename Matrix, typename Communicator > -__cuda_callable__ typename DistributedMatrix< Matrix, Communicator >::MatrixRow DistributedMatrix< Matrix, Communicator >:: getRow( IndexType row ) @@ -247,7 +239,6 
@@ getRow( IndexType row ) template< typename Matrix, typename Communicator > -__cuda_callable__ typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow DistributedMatrix< Matrix, Communicator >:: getRow( IndexType row ) const diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index 02a122a5d..23b563940 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -477,20 +477,20 @@ hauseholder_generate( const int i, ConstVectorViewType z ) { // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization + ConstDeviceView z_local = Traits::getConstLocalView( z ); + DeviceView y_i_local = Traits::getLocalView( y_i ); if( localOffset == 0 ) { TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" ); auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable { if( j < i ) - y_i[ j ] = 0.0; + y_i_local[ j ] = 0.0; else - y_i[ j ] = z[ j ]; + y_i_local[ j ] = z_local[ j ]; }; Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation ); } else { - ConstDeviceView z_local = Traits::getConstLocalView( z ); - DeviceView y_i_local = Traits::getLocalView( y_i ); y_i_local = z_local; } diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h index 788fc228d..f30151548 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h @@ -71,12 +71,15 @@ update( const MatrixPointer& matrixPointer ) diagonal.setSize( matrixPointer->getLocalMatrix().getRows() ); LocalViewType diag_view( diagonal ); - const MatrixType* kernel_matrix = &matrixPointer.template getData< DeviceType >(); + // FIXME: SparseMatrix::getConstView is broken +// const auto matrix_view = matrixPointer->getLocalMatrix().getConstView(); + const auto matrix_view = matrixPointer->getLocalMatrix().getView(); + const auto 
row_range = matrixPointer->getLocalRowRange(); auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable { - const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i ); - diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi ); + const IndexType gi = row_range.getGlobalIndex( i ); + diag_view[ i ] = matrix_view.getElement( i, gi ); }; Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); -- GitLab From 98fe52f6fe715c29aa381214501bf403dd5cbfa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 17 Nov 2020 14:27:35 +0100 Subject: [PATCH 22/50] Reordered methods in DistributedArrayView --- src/TNL/Containers/DistributedArrayView.h | 40 ++++---- src/TNL/Containers/DistributedArrayView.hpp | 108 ++++++++++---------- 2 files changed, 72 insertions(+), 76 deletions(-) diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index 1936c8d58..86395517d 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -79,27 +79,6 @@ public: template< typename Value_ > void bind( Value_* data, IndexType localSize ); - /** - * \brief Returns a modifiable view of the array view. - */ - ViewType getView(); - - /** - * \brief Returns a non-modifiable view of the array view. - */ - ConstViewType getConstView() const; - - - // Copy-assignment does deep copy, just like regular array, but the sizes - // must match (i.e. copy-assignment cannot resize). - DistributedArrayView& operator=( const DistributedArrayView& view ); - - template< typename Array, - typename..., - typename = std::enable_if_t< HasSubscriptOperator::value > > - DistributedArrayView& operator=( const Array& array ); - - const LocalRangeType& getLocalRange() const; CommunicationGroup getCommunicationGroup() const; @@ -115,6 +94,16 @@ public: * Usual ArrayView methods follow below. */ + /** + * \brief Returns a modifiable view of the array view. 
+ */ + ViewType getView(); + + /** + * \brief Returns a non-modifiable view of the array view. + */ + ConstViewType getConstView() const; + // Resets the array view to the empty state. void reset(); @@ -143,6 +132,15 @@ public: __cuda_callable__ const ValueType& operator[]( IndexType i ) const; + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). + DistributedArrayView& operator=( const DistributedArrayView& view ); + + template< typename Array, + typename..., + typename = std::enable_if_t< HasSubscriptOperator::value > > + DistributedArrayView& operator=( const Array& array ); + // Comparison operators template< typename Array > bool operator==( const Array& array ) const; diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 0a206054c..81583541c 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -75,61 +75,6 @@ bind( Value_* data, IndexType localSize ) localData.bind( data, localSize ); } -template< typename Value, - typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getView() -{ - return *this; -} - -template< typename Value, - typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getConstView() const -{ - return *this; -} - - -template< typename Value, - typename Device, - typename Index, - typename Communicator > -DistributedArrayView< Value, Device, Index, Communicator >& -DistributedArrayView< Value, Device, Index, Communicator >:: -operator=( const DistributedArrayView& view ) -{ - TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, 
views are not resizable." ); - TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." ); - localData = view.getConstLocalView(); - return *this; -} - -template< typename Value, - typename Device, - typename Index, - typename Communicator > - template< typename Array, typename..., typename > -DistributedArrayView< Value, Device, Index, Communicator >& -DistributedArrayView< Value, Device, Index, Communicator >:: -operator=( const Array& array ) -{ - TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." ); - localData = array.getConstLocalView(); - return *this; -} - - template< typename Value, typename Device, typename Index, @@ -197,6 +142,28 @@ copyFromGlobal( ConstLocalViewType globalArray ) } +template< typename Value, + typename Device, + typename Index, + typename Communicator > +typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType +DistributedArrayView< Value, Device, Index, Communicator >:: +getView() +{ + return *this; +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType +DistributedArrayView< Value, Device, Index, Communicator >:: +getConstView() const +{ + return *this; +} + template< typename Value, typename Device, typename Index, @@ -296,6 +263,37 @@ operator[]( IndexType i ) const return localData[ li ]; } +template< typename Value, + typename Device, + typename Index, + typename Communicator > +DistributedArrayView< Value, Device, 
Index, Communicator >& +DistributedArrayView< Value, Device, Index, Communicator >:: +operator=( const DistributedArrayView& view ) +{ + TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." ); + localData = view.getConstLocalView(); + return *this; +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > + template< typename Array, typename..., typename > +DistributedArrayView< Value, Device, Index, Communicator >& +DistributedArrayView< Value, Device, Index, Communicator >:: +operator=( const Array& array ) +{ + TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." 
); + localData = array.getConstLocalView(); + return *this; +} + template< typename Value, typename Device, typename Index, -- GitLab From 5184793e8b71e585ef191b10e54d9e54b551c4dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 17 Nov 2020 13:36:25 +0100 Subject: [PATCH 23/50] Added base class ByteArraySynchronizer --- src/TNL/Containers/ByteArraySynchronizer.h | 32 +++++++++++++++ .../DistributedMeshSynchronizer.h | 39 +++++++++++++------ 2 files changed, 59 insertions(+), 12 deletions(-) create mode 100644 src/TNL/Containers/ByteArraySynchronizer.h diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h new file mode 100644 index 000000000..520820c02 --- /dev/null +++ b/src/TNL/Containers/ByteArraySynchronizer.h @@ -0,0 +1,32 @@ +/*************************************************************************** + ByteArraySynchronizer.h - description + ------------------- + begin : November 17, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +#include + +namespace TNL { +namespace Containers { + +template< typename Device, typename Index > +class ByteArraySynchronizer +{ +public: + using ByteArrayView = ArrayView< std::uint8_t, Device, Index >; + + virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) = 0; + + virtual ~ByteArraySynchronizer() = default; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h index 724510bf4..225d1a2df 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h @@ -12,6 +12,7 @@ #pragma once +#include #include #include @@ -32,11 +33,15 @@ struct HasMeshType< T, typename Containers::Expressions::enable_if_type< typenam template< typename DistributedMesh, int EntityDimension = DistributedMesh::getMeshDimension() > class DistributedMeshSynchronizer +: public Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType > { + using Base = Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >; + public: using DeviceType = typename DistributedMesh::DeviceType; using GlobalIndexType = typename DistributedMesh::GlobalIndexType; using CommunicatorType = typename DistributedMesh::CommunicatorType; + using ByteArrayView = typename Base::ByteArrayView; DistributedMeshSynchronizer() = default; @@ -182,10 +187,20 @@ public: template< typename Array > void synchronizeArray( Array& array, int valuesPerElement = 1 ) { - TNL_ASSERT_EQ( array.getSize(), valuesPerElement * ghostOffsets[ ghostOffsets.getSize() - 1 ], 
- "The array does not have the expected size." ); + static_assert( std::is_same< typename Array::DeviceType, DeviceType >::value, + "mismatched DeviceType of the array" ); using ValueType = typename Array::ValueType; + ByteArrayView view; + view.bind( reinterpret_cast( array.getData() ), sizeof(ValueType) * array.getSize() ); + synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement ); + } + + virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override + { + TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ], + "The array does not have the expected size." ); + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ #ifdef HAVE_CUDA if( std::is_same< DeviceType, Devices::Cuda >::value ) @@ -196,7 +211,7 @@ public: const int nproc = CommunicatorType::GetSize( group ); // allocate send buffers (setSize does nothing if the array size is already correct) - sendBuffers.setSize( valuesPerElement * ghostNeighborOffsets[ nproc ] * sizeof(ValueType) ); + sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] ); // buffer for asynchronous communication requests std::vector< typename CommunicatorType::Request > requests; @@ -205,20 +220,20 @@ public: for( int j = 0; j < nproc; j++ ) { if( ghostEntitiesCounts( rank, j ) > 0 ) { requests.push_back( CommunicatorType::IRecv( - array.getData() + valuesPerElement * ghostOffsets[ j ], - valuesPerElement * ghostEntitiesCounts( rank, j ), + array.getData() + bytesPerValue * ghostOffsets[ j ], + bytesPerValue * ghostEntitiesCounts( rank, j ), j, 0, group ) ); } } - Containers::ArrayView< ValueType, DeviceType, GlobalIndexType > sendBuffersView; - sendBuffersView.bind( reinterpret_cast( sendBuffers.getData() ), valuesPerElement * ghostNeighborOffsets[ nproc ] ); + ByteArrayView sendBuffersView; + sendBuffersView.bind( sendBuffers.getData(), bytesPerValue * ghostNeighborOffsets[ nproc ] ); const auto 
ghostNeighborsView = ghostNeighbors.getConstView(); const auto arrayView = array.getConstView(); - auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, valuesPerElement] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable + auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, bytesPerValue] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable { - for( int i = 0; i < valuesPerElement; i++ ) - sendBuffersView[ i + valuesPerElement * (offset + k) ] = arrayView[ i + valuesPerElement * ghostNeighborsView[ offset + k ] ]; + for( int i = 0; i < bytesPerValue; i++ ) + sendBuffersView[ i + bytesPerValue * (offset + k) ] = arrayView[ i + bytesPerValue * ghostNeighborsView[ offset + k ] ]; }; for( int i = 0; i < nproc; i++ ) { @@ -229,8 +244,8 @@ public: // issue async send operation requests.push_back( CommunicatorType::ISend( - sendBuffersView.getData() + valuesPerElement * ghostNeighborOffsets[ i ], - valuesPerElement * ghostEntitiesCounts( i, rank ), + sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ], + bytesPerValue * ghostEntitiesCounts( i, rank ), i, 0, group ) ); } } -- GitLab From f0b42e43af8c031df53673e70c4085a3cbf91ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 18 Nov 2020 22:48:37 +0100 Subject: [PATCH 24/50] Added support for ghost ranges to DistributedArray and DistributedVector and their views --- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 4 +- .../tnl-benchmark-linear-solvers.h | 6 +- src/TNL/Containers/DistributedArray.h | 60 ++++--- src/TNL/Containers/DistributedArray.hpp | 120 ++++++++++++-- src/TNL/Containers/DistributedArrayView.h | 36 ++++- src/TNL/Containers/DistributedArrayView.hpp | 146 ++++++++++++++++-- src/TNL/Containers/DistributedVector.h | 19 ++- src/TNL/Containers/DistributedVector.hpp | 22 +++ src/TNL/Containers/DistributedVectorView.h | 18 +++ src/TNL/Containers/DistributedVectorView.hpp | 63 +++++++- 
.../Expressions/DistributedComparison.h | 6 + .../DistributedExpressionTemplates.h | 146 +++++++++++++++++- src/TNL/Containers/Partitioner.h | 66 +++++++- src/TNL/Matrices/DistributedMatrix_impl.h | 2 +- .../Containers/DistributedArrayTest.h | 80 ++++++++-- .../Containers/DistributedVectorTest.h | 14 +- .../Containers/VectorBinaryOperationsTest.h | 24 ++- .../Containers/VectorHelperFunctions.h | 14 +- .../Containers/VectorUnaryOperationsTest.h | 35 ++++- .../Containers/VectorVerticalOperationsTest.h | 14 +- .../Matrices/DistributedMatrixTest.h | 8 +- 21 files changed, 792 insertions(+), 111 deletions(-) diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index b791b0100..74a3205d3 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -228,10 +228,10 @@ struct SpmvBenchmark const auto group = CommunicatorType::AllGroup; const auto localRange = Partitioner::splitRange( matrix.getRows(), group ); DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group ); - DistributedVector distributedVector( localRange, matrix.getRows(), group ); + DistributedVector distributedVector( localRange, 0, matrix.getRows(), group ); // copy the row lengths from the global matrix to the distributed matrix - DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group ); + DistributedRowLengths distributedRowLengths( localRange, 0, matrix.getRows(), group ); for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) { const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i ); distributedRowLengths[ gi ] = matrix.getRowCapacity( gi ); diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index cadb5a046..06ba2bc94 100644 --- 
a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -435,11 +435,11 @@ struct LinearSolversBenchmark const auto group = CommunicatorType::AllGroup; const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group ); SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group ); - DistributedVector dist_x0( localRange, matrixPointer->getRows(), group ); - DistributedVector dist_b( localRange, matrixPointer->getRows(), group ); + DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group ); + DistributedVector dist_b( localRange, 0, matrixPointer->getRows(), group ); // copy the row capacities from the global matrix to the distributed matrix - DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group ); + DistributedRowLengths distributedRowLengths( localRange, 0, matrixPointer->getRows(), group ); for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) { const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i ); distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi ); diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h index 31fc6d8a8..c1571bc9e 100644 --- a/src/TNL/Containers/DistributedArray.h +++ b/src/TNL/Containers/DistributedArray.h @@ -37,6 +37,7 @@ public: using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; using ViewType = DistributedArrayView< Value, Device, Index, Communicator >; using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >; + using SynchronizerType = typename ViewType::SynchronizerType; /** * \brief A template which allows to quickly obtain a \ref DistributedArray type with changed template parameters. 
@@ -50,46 +51,54 @@ public: DistributedArray() = default; - DistributedArray( const DistributedArray& ) = default; + // Copy-constructor does deep copy. + DistributedArray( const DistributedArray& ); - DistributedArray( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); - void setDistribution( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); const LocalRangeType& getLocalRange() const; + IndexType getGhosts() const; + CommunicationGroup getCommunicationGroup() const; /** * \brief Returns a modifiable view of the local part of the array. - * - * If \e begin or \e end is set to a non-zero value, a view for the - * sub-interval `[begin, end)` is returned. Otherwise a view for whole - * local part of the array view is returned. - * - * \param begin The beginning of the array view sub-interval. It is 0 by - * default. - * \param end The end of the array view sub-interval. The default value is 0 - * which is, however, replaced with the array size. */ LocalViewType getLocalView(); /** * \brief Returns a non-modifiable view of the local part of the array. - * - * If \e begin or \e end is set to a non-zero value, a view for the - * sub-interval `[begin, end)` is returned. Otherwise a view for whole - * local part of the array view is returned. - * - * \param begin The beginning of the array view sub-interval. It is 0 by - * default. - * \param end The end of the array view sub-interval. The default value is 0 - * which is, however, replaced with the array size. */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the array, + * including ghost values. 
+ */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the array, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + void copyFromGlobal( ConstLocalViewType globalArray ); + // synchronizer stuff + void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 ); + + std::shared_ptr< SynchronizerType > getSynchronizer() const; + + int getValuesPerElement() const; + + void startSynchronization(); + + void waitForSynchronization() const; + // Usual Array methods follow below. @@ -170,6 +179,15 @@ public: protected: ViewType view; LocalArrayType localData; + +private: + template< typename Array, std::enable_if_t< std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true > + static void setSynchronizerHelper( ViewType& view, const Array& array ) + { + view.setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() ); + } + template< typename Array, std::enable_if_t< ! 
std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true > + static void setSynchronizerHelper( ViewType& view, const Array& array ) {} }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp index 4910cbcd7..c23d0a7e4 100644 --- a/src/TNL/Containers/DistributedArray.hpp +++ b/src/TNL/Containers/DistributedArray.hpp @@ -25,9 +25,20 @@ template< typename Value, typename Index, typename Communicator > DistributedArray< Value, Device, Index, Communicator >:: -DistributedArray( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group ) +DistributedArray( const DistributedArray& array ) { - setDistribution( localRange, globalSize, group ); + setLike( array ); + localData = array.getConstLocalViewWithGhosts(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group ) +{ + setDistribution( localRange, ghosts, globalSize, group ); } template< typename Value, @@ -36,12 +47,12 @@ template< typename Value, typename Communicator > void DistributedArray< Value, Device, Index, Communicator >:: -setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group ) +setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group ) { TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" ); if( group != Communicator::NullGroup ) - localData.setSize( localRange.getSize() ); - view.bind( localRange, globalSize, group, localData.getView() ); + localData.setSize( localRange.getSize() + ghosts ); + view.bind( localRange, ghosts, globalSize, group, localData.getView() ); } template< typename Value, @@ -55,6 +66,17 @@ getLocalRange() const return 
view.getLocalRange(); } +template< typename Value, + typename Device, + typename Index, + typename Communicator > +Index +DistributedArray< Value, Device, Index, Communicator >:: +getGhosts() const +{ + return view.getGhosts(); +} + template< typename Value, typename Device, typename Index, @@ -74,7 +96,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType DistributedArray< Value, Device, Index, Communicator >:: getLocalView() { - return localData.getView(); + return view.getLocalView(); } template< typename Value, @@ -85,7 +107,29 @@ typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewT DistributedArray< Value, Device, Index, Communicator >:: getConstLocalView() const { - return localData.getConstView(); + return view.getConstLocalView(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType +DistributedArray< Value, Device, Index, Communicator >:: +getLocalViewWithGhosts() +{ + return view.getLocalViewWithGhosts(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType +DistributedArray< Value, Device, Index, Communicator >:: +getConstLocalViewWithGhosts() const +{ + return view.getConstLocalViewWithGhosts(); } @@ -100,6 +144,61 @@ copyFromGlobal( ConstLocalViewType globalArray ) view.copyFromGlobal( globalArray ); } +template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArray< Value, Device, Index, Communicator >:: +setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) +{ + view.setSynchronizer( synchronizer, valuesPerElement ); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +std::shared_ptr< typename DistributedArrayView< Value, 
Device, Index, Communicator >::SynchronizerType > +DistributedArray< Value, Device, Index, Communicator >:: +getSynchronizer() const +{ + return view.getSynchronizer(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +int +DistributedArray< Value, Device, Index, Communicator >:: +getValuesPerElement() const +{ + return view.getValuesPerElement(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArray< Value, Device, Index, Communicator >:: +startSynchronization() +{ + view.startSynchronization(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArray< Value, Device, Index, Communicator >:: +waitForSynchronization() const +{ + view.waitForSynchronization(); +} + /* * Usual Array methods follow below. @@ -156,8 +255,11 @@ void DistributedArray< Value, Device, Index, Communicator >:: setLike( const Array& array ) { - localData.setLike( array.getConstLocalView() ); - view.bind( array.getLocalRange(), array.getSize(), array.getCommunicationGroup(), localData.getView() ); + localData.setLike( array.getConstLocalViewWithGhosts() ); + view.bind( array.getLocalRange(), array.getGhosts(), array.getSize(), array.getCommunicationGroup(), localData.getView() ); + // set, but do not unset, the synchronizer + if( array.getSynchronizer() ) + setSynchronizerHelper( view, array ); } template< typename Value, diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index 86395517d..bf63f8cc6 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -12,9 +12,12 @@ #pragma once +#include + #include #include #include +#include namespace TNL { namespace Containers { @@ -36,6 +39,7 @@ public: using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; using ViewType = DistributedArrayView< 
Value, Device, Index, Communicator >; using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >; + using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >; /** * \brief A template which allows to quickly obtain a \ref DistributedArrayView type with changed template parameters. @@ -48,11 +52,12 @@ public: // Initialization by raw data - DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) - : localRange(localRange), globalSize(globalSize), group(group), localData(localData) + DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) + : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData) { - TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(), + TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." ); + TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." ); } DistributedArrayView() = default; @@ -68,27 +73,44 @@ public: DistributedArrayView( DistributedArrayView&& ) = default; // method for rebinding (reinitialization) to raw data - void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ); + void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ); // Note that you can also bind directly to DistributedArray and other types implicitly // convertible to DistributedArrayView. 
void bind( DistributedArrayView view ); // binding to local array via raw pointer - // (local range, global size and communication group are preserved) + // (local range, ghosts, global size and communication group are preserved) template< typename Value_ > void bind( Value_* data, IndexType localSize ); const LocalRangeType& getLocalRange() const; + IndexType getGhosts() const; + CommunicationGroup getCommunicationGroup() const; LocalViewType getLocalView(); ConstLocalViewType getConstLocalView() const; + LocalViewType getLocalViewWithGhosts(); + + ConstLocalViewType getConstLocalViewWithGhosts() const; + void copyFromGlobal( ConstLocalViewType globalArray ); + // synchronizer stuff + void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 ); + + std::shared_ptr< SynchronizerType > getSynchronizer() const; + + int getValuesPerElement() const; + + void startSynchronization(); + + void waitForSynchronization() const; + /* * Usual ArrayView methods follow below. 
@@ -156,9 +178,13 @@ public: protected: LocalRangeType localRange; + IndexType ghosts = 0; IndexType globalSize = 0; CommunicationGroup group = Communicator::NullGroup; LocalViewType localData; + + std::shared_ptr< SynchronizerType > synchronizer = nullptr; + int valuesPerElement = 1; }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 81583541c..cb95427fc 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -25,9 +25,12 @@ template< typename Value, DistributedArrayView< Value, Device, Index, Communicator >:: DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view ) : localRange( view.getLocalRange() ), + ghosts( view.getGhosts() ), globalSize( view.getSize() ), group( view.getCommunicationGroup() ), - localData( view.getConstLocalView() ) + localData( view.getConstLocalViewWithGhosts() ), + synchronizer( view.getSynchronizer() ), + valuesPerElement( view.getValuesPerElement() ) {} template< typename Value, @@ -36,12 +39,14 @@ template< typename Value, typename Communicator > void DistributedArrayView< Value, Device, Index, Communicator >:: -bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) +bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) { - TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(), + TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." ); + TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." 
); this->localRange = localRange; + this->ghosts = ghosts; this->globalSize = globalSize; this->group = group; this->localData.bind( localData ); @@ -56,9 +61,13 @@ DistributedArrayView< Value, Device, Index, Communicator >:: bind( DistributedArrayView view ) { localRange = view.getLocalRange(); + ghosts = view.getGhosts(); globalSize = view.getSize(); group = view.getCommunicationGroup(); - localData.bind( view.getLocalView() ); + localData.bind( view.getLocalViewWithGhosts() ); + // set, but do not unset, the synchronizer + if( view.getSynchronizer() ) + setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() ); } template< typename Value, @@ -70,7 +79,7 @@ void DistributedArrayView< Value, Device, Index, Communicator >:: bind( Value_* data, IndexType localSize ) { - TNL_ASSERT_EQ( localSize, localRange.getSize(), + TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." ); localData.bind( data, localSize ); } @@ -86,6 +95,17 @@ getLocalRange() const return localRange; } +template< typename Value, + typename Device, + typename Index, + typename Communicator > +Index +DistributedArrayView< Value, Device, Index, Communicator >:: +getGhosts() const +{ + return ghosts; +} + template< typename Value, typename Device, typename Index, @@ -105,7 +125,7 @@ typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewTy DistributedArrayView< Value, Device, Index, Communicator >:: getLocalView() { - return localData; + return LocalViewType( localData.getData(), localRange.getSize() ); } template< typename Value, @@ -115,6 +135,28 @@ template< typename Value, typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType DistributedArrayView< Value, Device, Index, Communicator >:: getConstLocalView() const +{ + return ConstLocalViewType( localData.getData(), localRange.getSize() ); +} + +template< typename Value, + typename Device, + typename 
Index, + typename Communicator > +typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType +DistributedArrayView< Value, Device, Index, Communicator >:: +getLocalViewWithGhosts() +{ + return localData; +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType +DistributedArrayView< Value, Device, Index, Communicator >:: +getConstLocalViewWithGhosts() const { return localData; } @@ -130,7 +172,7 @@ copyFromGlobal( ConstLocalViewType globalArray ) TNL_ASSERT_EQ( getSize(), globalArray.getSize(), "given global array has different size than the distributed array view" ); - LocalViewType localView( localData ); + LocalViewType localView = getLocalView(); const LocalRangeType localRange = getLocalRange(); auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable @@ -139,6 +181,78 @@ copyFromGlobal( ConstLocalViewType globalArray ) }; Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel ); + startSynchronization(); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArrayView< Value, Device, Index, Communicator >:: +setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) +{ + this->synchronizer = synchronizer; + this->valuesPerElement = valuesPerElement; +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType > +DistributedArrayView< Value, Device, Index, Communicator >:: +getSynchronizer() const +{ + return synchronizer; +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +int +DistributedArrayView< Value, Device, Index, Communicator >:: +getValuesPerElement() const +{ + return valuesPerElement; +} + 
+template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArrayView< Value, Device, Index, Communicator >:: +startSynchronization() +{ + if( ghosts == 0 ) + return; + // TODO: assert does not play very nice with automatic synchronizations from operations like + // assignment of scalars + // (Maybe we should just drop all automatic syncs? But that's not nice for high-level codes + // like linear solvers...) + TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" ); + + // wait for any previous synchronization (in case the array was inconsistently modified + // while a synchronization was in progress) + waitForSynchronization(); + + typename SynchronizerType::ByteArrayView bytes; + bytes.bind( reinterpret_cast( localData.getData() ), sizeof(ValueType) * localData.getSize() ); + // TODO: implement the async stuff + synchronizer->synchronizeByteArray( bytes, sizeof(ValueType) * valuesPerElement ); +} + +template< typename Value, + typename Device, + typename Index, + typename Communicator > +void +DistributedArrayView< Value, Device, Index, Communicator >:: +waitForSynchronization() const +{ + // TODO: implement the async stuff } @@ -173,6 +287,7 @@ DistributedArrayView< Value, Device, Index, Communicator >:: reset() { localRange.reset(); + ghosts = 0; globalSize = 0; group = Communicator::NullGroup; localData.reset(); @@ -211,6 +326,7 @@ DistributedArrayView< Value, Device, Index, Communicator >:: setValue( ValueType value ) { localData.setValue( value ); + startSynchronization(); } template< typename Value, @@ -273,8 +389,12 @@ operator=( const DistributedArrayView& view ) { TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." ); TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." 
); TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." ); - localData = view.getConstLocalView(); + localData = view.getConstLocalViewWithGhosts(); + // set, but do not unset, the synchronizer + if( view.getSynchronizer() ) + setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() ); return *this; } @@ -289,8 +409,12 @@ operator=( const Array& array ) { TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." ); - localData = array.getConstLocalView(); + localData = array.getConstLocalViewWithGhosts(); + // set, but do not unset, the synchronizer + if( array.getSynchronizer() ) + setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() ); return *this; } @@ -308,8 +432,10 @@ operator==( const Array& array ) const return false; const bool localResult = localRange == array.getLocalRange() && + ghosts == array.getGhosts() && globalSize == array.getSize() && - localData == array.getConstLocalView(); + // compare without ghosts + getConstLocalView() == array.getConstLocalView(); bool result = true; if( group != CommunicatorType::NullGroup ) CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 5d5f8303f..32dc80125 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -75,11 +75,28 @@ public: */ DistributedVector& operator=( DistributedVector&& ) = default; - // we return only the view so that the user cannot resize it + /** + * \brief 
Returns a modifiable view of the local part of the vector. + */ LocalViewType getLocalView(); + /** + * \brief Returns a non-modifiable view of the local part of the vector. + */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the vector, + * including ghost values. + */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the vector, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + /** * \brief Returns a modifiable view of the vector. */ diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index b2c7de038..cbbc763ec 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -40,6 +40,28 @@ getConstLocalView() const return BaseType::getConstLocalView(); } +template< typename Real, + typename Device, + typename Index, + typename Communicator > +typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType +DistributedVector< Real, Device, Index, Communicator >:: +getLocalViewWithGhosts() +{ + return BaseType::getLocalViewWithGhosts(); +} + +template< typename Real, + typename Device, + typename Index, + typename Communicator > +typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType +DistributedVector< Real, Device, Index, Communicator >:: +getConstLocalViewWithGhosts() const +{ + return BaseType::getConstLocalViewWithGhosts(); +} + template< typename Value, typename Device, typename Index, diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index cb46f59c3..6be52d9db 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -65,10 +65,28 @@ public: DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view ) : BaseType( view ) 
{} + /** + * \brief Returns a modifiable view of the local part of the vector. + */ LocalViewType getLocalView(); + /** + * \brief Returns a non-modifiable view of the local part of the vector. + */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the vector, + * including ghost values. + */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the vector, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + /** * \brief Returns a modifiable view of the array view. */ diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 0e32343a4..f1a6fb1e5 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -40,6 +40,28 @@ getConstLocalView() const return BaseType::getConstLocalView(); } +template< typename Real, + typename Device, + typename Index, + typename Communicator > +typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType +DistributedVectorView< Real, Device, Index, Communicator >:: +getLocalViewWithGhosts() +{ + return BaseType::getLocalViewWithGhosts(); +} + +template< typename Real, + typename Device, + typename Index, + typename Communicator > +typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType +DistributedVectorView< Real, Device, Index, Communicator >:: +getConstLocalViewWithGhosts() const +{ + return BaseType::getConstLocalViewWithGhosts(); +} + template< typename Value, typename Device, typename Index, @@ -80,11 +102,16 @@ operator=( const Vector& vector ) "The sizes of the array views must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "The local ranges must be equal, views are not resizable." 
); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "The communication groups of the array views must be equal." ); if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = vector.getConstLocalView(); + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() = vector.getConstLocalViewWithGhosts(); } return *this; } @@ -102,11 +129,16 @@ operator+=( const Vector& vector ) "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." ); if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += vector.getConstLocalView(); + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() += vector.getConstLocalViewWithGhosts(); } return *this; } @@ -124,11 +156,16 @@ operator-=( const Vector& vector ) "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." 
); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." ); if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= vector.getConstLocalView(); + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() -= vector.getConstLocalViewWithGhosts(); } return *this; } @@ -146,11 +183,16 @@ operator*=( const Vector& vector ) "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." ); if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() *= vector.getConstLocalView(); + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() *= vector.getConstLocalViewWithGhosts(); } return *this; } @@ -168,11 +210,16 @@ operator/=( const Vector& vector ) "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= vector.getConstLocalView(); + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() /= vector.getConstLocalViewWithGhosts(); } return *this; } @@ -188,6 +235,7 @@ operator=( Scalar c ) { if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { getLocalView() = c; + this->startSynchronization(); } return *this; } @@ -203,6 +251,7 @@ operator+=( Scalar c ) { if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { getLocalView() += c; + this->startSynchronization(); } return *this; } @@ -218,6 +267,7 @@ operator-=( Scalar c ) { if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { getLocalView() -= c; + this->startSynchronization(); } return *this; } @@ -233,6 +283,7 @@ operator*=( Scalar c ) { if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { getLocalView() *= c; + this->startSynchronization(); } return *this; } @@ -248,6 +299,7 @@ operator/=( Scalar c ) { if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { getLocalView() /= c; + this->startSynchronization(); } return *this; } @@ -264,6 +316,7 @@ scan( IndexType begin, IndexType end ) if( end == 0 ) end = this->getSize(); Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); + this->startSynchronization(); } } // namespace Containers diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h index 4cecc92bb..1cef0873d 100644 --- a/src/TNL/Containers/Expressions/DistributedComparison.h +++ b/src/TNL/Containers/Expressions/DistributedComparison.h @@ -38,7 +38,9 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getLocalRange() == 
b.getLocalRange() && + a.getGhosts() == b.getGhosts() && a.getSize() == b.getSize() && + // compare without ghosts a.getConstLocalView() == b.getConstLocalView(); bool result = true; if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) @@ -55,6 +57,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." ); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) @@ -70,6 +73,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." ); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) @@ -85,6 +89,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." 
); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) @@ -100,6 +105,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." ); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 1802dcc95..25175a467 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include @@ -64,6 +65,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, typename T2::ConstLocalViewType, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." ); @@ -79,13 +81,16 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV "Attempt to mix operands with different sizes." ); TNL_ASSERT_EQ( op1.getLocalRange(), op2.getLocalRange(), "Distributed expressions are supported only on vectors which are distributed the same way." 
); + TNL_ASSERT_EQ( op1.getGhosts(), op2.getGhosts(), + "Distributed expressions are supported only on vectors which are distributed the same way." ); TNL_ASSERT_EQ( op1.getCommunicationGroup(), op2.getCommunicationGroup(), "Distributed expressions are supported only on vectors within the same communication group." ); } RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -105,6 +110,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getLocalRange(); } + IndexType getGhosts() const + { + return op1.getGhosts(); + } + CommunicationGroup getCommunicationGroup() const { return op1.getCommunicationGroup(); @@ -115,6 +125,27 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return ConstLocalViewType( op1.getConstLocalView(), op2.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return op1.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op1.getValuesPerElement(); + } + + void waitForSynchronization() const + { + op1.waitForSynchronization(); + op2.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -132,6 +163,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( 
HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." ); @@ -141,7 +173,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -161,6 +194,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getLocalRange(); } + IndexType getGhosts() const + { + return op1.getGhosts(); + } + CommunicationGroup getCommunicationGroup() const { return op1.getCommunicationGroup(); @@ -171,6 +209,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return ConstLocalViewType( op1.getConstLocalView(), op2 ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2 ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return op1.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op1.getValuesPerElement(); + } + + void waitForSynchronization() const + { + op1.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -188,6 +246,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T2::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >; + using SynchronizerType = typename T2::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T2 >::value, "Invalid operand in distributed binary expression templates - 
distributed expression templates are not enabled for the right operand." ); @@ -197,7 +256,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -217,6 +277,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl return op2.getLocalRange(); } + IndexType getGhosts() const + { + return op2.getGhosts(); + } + CommunicationGroup getCommunicationGroup() const { return op2.getCommunicationGroup(); @@ -227,6 +292,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl return ConstLocalViewType( op1, op2.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1, op2.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return op2.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op2.getValuesPerElement(); + } + + void waitForSynchronization() const + { + op2.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -245,6 +330,7 @@ struct DistributedUnaryExpressionTemplate using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed unary expression templates - distributed expression templates are not enabled for the operand." 
); @@ -254,7 +340,8 @@ struct DistributedUnaryExpressionTemplate RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -274,6 +361,11 @@ struct DistributedUnaryExpressionTemplate return operand.getLocalRange(); } + IndexType getGhosts() const + { + return operand.getGhosts(); + } + CommunicationGroup getCommunicationGroup() const { return operand.getCommunicationGroup(); @@ -284,6 +376,26 @@ struct DistributedUnaryExpressionTemplate return ConstLocalViewType( operand.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( operand.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return operand.getSynchronizer(); + } + + int getValuesPerElement() const + { + return operand.getValuesPerElement(); + } + + void waitForSynchronization() const + { + operand.waitForSynchronization(); + } + protected: const T1& operand; }; @@ -812,10 +924,19 @@ template< typename T1, typename Operation > std::ostream& operator<<( std::ostream& str, const DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression ) { + const auto localRange = expression.getLocalRange(); str << "[ "; - for( int i = 0; i < expression.getSize() - 1; i++ ) + for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ ) str << expression.getElement( i ) << ", "; - str << expression.getElement( expression.getSize() - 1 ) << " ]"; + str << expression.getElement( localRange.getEnd() - 1 ); + if( expression.getGhosts() > 0 ) { + str << " | "; + const auto localView = expression.getConstLocalViewWithGhosts(); + for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ ) + str << localView.getElement( i ) << ", "; + str << 
localView.getElement( localView.getSize() - 1 ); + } + str << " ]"; return str; } @@ -823,10 +944,19 @@ template< typename T, typename Operation > std::ostream& operator<<( std::ostream& str, const DistributedUnaryExpressionTemplate< T, Operation >& expression ) { + const auto localRange = expression.getLocalRange(); str << "[ "; - for( int i = 0; i < expression.getSize() - 1; i++ ) + for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ ) str << expression.getElement( i ) << ", "; - str << expression.getElement( expression.getSize() - 1 ) << " ]"; + str << expression.getElement( localRange.getEnd() - 1 ); + if( expression.getGhosts() > 0 ) { + str << " | "; + const auto localView = expression.getConstLocalViewWithGhosts(); + for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ ) + str << localView.getElement( i ) << ", "; + str << localView.getElement( localView.getSize() - 1 ); + } + str << " ]"; return str; } diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h index f0b507475..75e958734 100644 --- a/src/TNL/Containers/Partitioner.h +++ b/src/TNL/Containers/Partitioner.h @@ -12,7 +12,10 @@ #pragma once +#include + #include "Subrange.h" +#include "ByteArraySynchronizer.h" #include @@ -66,13 +69,64 @@ public: const Index end = min( globalSize, (rank + 1) * globalSize / partitions ); return end - begin; } -}; -// TODO: -// - partitioner in deal.II stores also ghost indices: -// https://www.dealii.org/8.4.0/doxygen/deal.II/classUtilities_1_1MPI_1_1Partitioner.html -// - ghost indices are stored in a general IndexMap class (based on collection of subranges): -// https://www.dealii.org/8.4.0/doxygen/deal.II/classIndexSet.html + template< typename Device > + class ArraySynchronizer + : public ByteArraySynchronizer< Device, Index > + { + using Base = ByteArraySynchronizer< Device, Index >; + + SubrangeType localRange; + int overlaps; + CommunicationGroup group; + + public: + using ByteArrayView = typename 
Base::ByteArrayView; + + ArraySynchronizer() = delete; + + ArraySynchronizer( SubrangeType localRange, int overlaps, CommunicationGroup group ) + : localRange(localRange), overlaps(overlaps), group(group) + {} + + virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override + { + TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps), + "unexpected array size" ); + + const int rank = Communicator::GetRank( group ); + const int nproc = Communicator::GetSize( group ); + const int left = (rank > 0) ? rank - 1 : nproc - 1; + const int right = (rank < nproc - 1) ? rank + 1 : 0; + + // buffer for asynchronous communication requests + std::vector< typename Communicator::Request > requests; + + // issue all async receive operations + requests.push_back( Communicator::IRecv( + array.getData() + bytesPerValue * localRange.getSize(), + bytesPerValue * overlaps, + left, 0, group ) ); + requests.push_back( Communicator::IRecv( + array.getData() + bytesPerValue * (localRange.getSize() + overlaps), + bytesPerValue * overlaps, + right, 0, group ) ); + + // issue all async send operations + requests.push_back( Communicator::ISend( + array.getData(), + bytesPerValue * overlaps, + left, 0, group ) ); + requests.push_back( Communicator::ISend( + array.getData() + bytesPerValue * (localRange.getSize() - overlaps), + bytesPerValue * overlaps, + right, 0, group ) ); + + // wait for all communications to finish + Communicator::WaitAll( requests.data(), requests.size() ); + } + }; +}; } // namespace Containers } // namespace TNL diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h index 40a675f1a..d42ce0ae7 100644 --- a/src/TNL/Matrices/DistributedMatrix_impl.h +++ b/src/TNL/Matrices/DistributedMatrix_impl.h @@ -177,7 +177,7 @@ DistributedMatrix< Matrix, Communicator >:: getCompressedRowLengths( Vector& rowLengths ) const { if( getCommunicationGroup() != CommunicatorType::NullGroup ) { - 
rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() ); + rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() ); auto localRowLengths = rowLengths.getLocalView(); localMatrix.getCompressedRowLengths( localRowLengths ); } diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index 097a60d26..d201a0a09 100644 --- a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -13,6 +13,8 @@ #include #include +#include "VectorHelperFunctions.h" + using namespace TNL; using namespace TNL::Containers; @@ -45,13 +47,20 @@ protected: const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 4 : 0; + DistributedArrayTest() { using LocalRangeType = typename DistributedArray::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - distributedArray.setDistribution( localRange, globalSize, group ); + distributedArray.setDistribution( localRange, ghosts, globalSize, group ); + + using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >; + distributedArray.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); EXPECT_EQ( distributedArray.getLocalRange(), localRange ); + EXPECT_EQ( distributedArray.getGhosts(), ghosts ); EXPECT_EQ( distributedArray.getCommunicationGroup(), group ); } }; @@ -67,6 +76,14 @@ using DistributedArrayTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedArrayTest, DistributedArrayTypes ); +TYPED_TEST( DistributedArrayTest, checkLocalSizes ) +{ + EXPECT_EQ( this->distributedArray.getLocalView().getSize(), this->distributedArray.getLocalRange().getSize() ); + EXPECT_EQ( 
this->distributedArray.getConstLocalView().getSize(), this->distributedArray.getLocalRange().getSize() ); + EXPECT_EQ( this->distributedArray.getLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts ); + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts ); +} + TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes ) { using CommunicatorType = typename TestFixture::CommunicatorType; @@ -85,14 +102,25 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal ) this->distributedArray.setValue( 0.0 ); ArrayType globalArray( this->globalSize ); - globalArray.setValue( 1.0 ); + setLinearSequence( globalArray ); this->distributedArray.copyFromGlobal( globalArray ); - ArrayViewType localArrayView = this->distributedArray.getLocalView(); - auto globalView = globalArray.getConstView(); const auto localRange = this->distributedArray.getLocalRange(); - globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getEnd() - localRange.getBegin() ); + ArrayViewType localArrayView; + localArrayView.bind( this->distributedArray.getLocalView().getData(), localRange.getSize() ); + auto globalView = globalArray.getConstView(); + globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getSize() ); EXPECT_EQ( localArrayView, globalView ); + + // check ghost values + for( int o = 0; o < this->ghosts / 2; o++ ) { + const int left_i = localRange.getSize() + o; + const int left_gi = ((this->rank > 0) ? localRange.getBegin() : this->globalSize) - this->ghosts / 2 + o; + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( left_i ), globalArray.getElement( left_gi ) ); + const int right_i = localRange.getSize() + this->ghosts / 2 + o; + const int right_gi = ((this->rank < this->nproc - 1) ? 
localRange.getEnd() : 0) + o; + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( right_i ), globalArray.getElement( right_gi ) ); + } } TYPED_TEST( DistributedArrayTest, setLike ) @@ -129,6 +157,27 @@ TYPED_TEST( DistributedArrayTest, setValue ) EXPECT_EQ( localArrayView, expected ); } +TYPED_TEST( DistributedArrayTest, setValueGhosts ) +{ + using ArrayViewType = typename TestFixture::ArrayViewType; + using ArrayType = typename TestFixture::ArrayType; + + this->distributedArray.setValue( this->rank ); + ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts(); + ArrayType expected( localArrayView.getSize() ); + expected.setValue( this->rank ); + + // set expected ghost values + const int left = (this->rank > 0) ? this->rank - 1 : this->nproc - 1; + const int right = (this->rank < this->nproc - 1) ? this->rank + 1 : 0; + for( int o = 0; o < this->ghosts / 2; o++ ) { + expected.setElement( this->distributedArray.getLocalRange().getSize() + o, left ); + expected.setElement( this->distributedArray.getLocalRange().getSize() + this->ghosts / 2 + o, right ); + } + + EXPECT_EQ( localArrayView, expected ); +} + TYPED_TEST( DistributedArrayTest, elementwiseAccess ) { using ArrayViewType = typename TestFixture::ArrayViewType; @@ -139,7 +188,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) const auto localRange = this->distributedArray.getLocalRange(); // check initial value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), 0 ); EXPECT_EQ( this->distributedArray.getElement( gi ), 0 ); @@ -149,13 +198,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) } // use setValue - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); 
this->distributedArray.setElement( gi, i + 1 ); } // check set value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), i + 1 ); EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 ); @@ -168,13 +217,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) // use operator[] if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray[ gi ] = i + 1; } // check set value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), i + 1 ); EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 ); @@ -189,8 +238,9 @@ TYPED_TEST( DistributedArrayTest, copyConstructor ) this->distributedArray.setValue( 1 ); DistributedArrayType copy( this->distributedArray ); - // Array has "binding" copy-constructor - //EXPECT_EQ( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() ); + // no binding, but deep copy + EXPECT_NE( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() ); + EXPECT_EQ( copy.getLocalView(), this->distributedArray.getLocalView() ); } TYPED_TEST( DistributedArrayTest, copyAssignment ) @@ -216,7 +266,7 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators ) v.setLike( u ); w.setLike( u ); - for( int i = 0; i < u.getLocalView().getSize(); i ++ ) { + for( int i = 0; i < localRange.getSize(); i ++ ) { const IndexType gi = localRange.getGlobalIndex( i ); u.setElement( gi, i ); v.setElement( gi, i ); @@ -245,7 +295,7 @@ TYPED_TEST( DistributedArrayTest, containsValue ) const 
auto localRange = this->distributedArray.getLocalRange(); - for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) { + for( int i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray.setElement( gi, i % 10 ); } @@ -263,7 +313,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue ) const auto localRange = this->distributedArray.getLocalRange(); - for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) { + for( int i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray.setElement( gi, i % 10 ); } diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index 1d727aef6..5a201980c 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -56,11 +56,21 @@ protected: // scan with multiple CUDA grids const int globalSize = 10000 * nproc; + // some arbitrary value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 
4 : 0; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - v.setDistribution( localRange, globalSize, group ); + v.setDistribution( localRange, ghosts, globalSize, group ); + + using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >; + using HostSynchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< Devices::Sequential >; + v.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); + v_view.setSynchronizer( v.getSynchronizer() ); + v_host.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); + v_view.bind( v ); setConstantSequence( v, 1 ); } @@ -77,6 +87,8 @@ using DistributedVectorTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); +// TODO: test that horizontal operations are computed for ghost values without synchronization + TYPED_TEST( DistributedVectorTest, scan ) { using RealType = typename TestFixture::DistributedVectorType::RealType; diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index 7f81d87f5..b659beaea 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -66,6 +66,14 @@ protected: "CommunicatorType must be the same for both Left and Right vectors." 
); using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >; using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >; + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + // some arbitrary value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 4 : 0; #else using LeftVector = Vector< LeftReal, typename Left::DeviceType, typename Left::IndexType >; using RightVector = Vector< RightReal, typename Right::DeviceType, typename Right::IndexType >; @@ -89,14 +97,20 @@ protected: R2 = 2; #else #ifdef DISTRIBUTED_VECTOR - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; using LocalRangeType = typename LeftVector::LocalRangeType; + using Synchronizer = typename Partitioner< typename Left::IndexType, CommunicatorType >::template ArraySynchronizer< typename Left::DeviceType >; const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group ); - _L1.setDistribution( localRange, size, group ); - _L2.setDistribution( localRange, size, group ); - _R1.setDistribution( localRange, size, group ); - _R2.setDistribution( localRange, size, group ); + _L1.setDistribution( localRange, ghosts, size, group ); + _L2.setDistribution( localRange, ghosts, size, group ); + _R1.setDistribution( localRange, ghosts, size, group ); + _R2.setDistribution( localRange, ghosts, size, group ); + + auto synchronizer = std::make_shared( localRange, ghosts / 2, group ); + _L1.setSynchronizer( synchronizer ); + _L2.setSynchronizer( synchronizer ); + _R1.setSynchronizer( synchronizer ); + _R2.setSynchronizer( synchronizer ); #else _L1.setSize( size ); _L2.setSize( size ); diff --git 
a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h index 649de1cee..b7e8a1b95 100644 --- a/src/UnitTests/Containers/VectorHelperFunctions.h +++ b/src/UnitTests/Containers/VectorHelperFunctions.h @@ -9,15 +9,17 @@ void setLinearSequence( Vector& deviceVector ) #ifdef STATIC_VECTOR Vector a; #else - using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >; + using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >; HostVector a; a.setLike( deviceVector ); #endif #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = gi; } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = -1; // dummy ghost value #else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = i; @@ -62,10 +64,12 @@ void setNegativeLinearSequence( Vector& deviceVector ) HostVector a; a.setLike( deviceVector ); #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = -gi; } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = 1; // dummy ghost value #else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = -i; @@ -85,10 +89,12 @@ void setOscilatingSequence( Vector& deviceVector, a.setLike( deviceVector ); #endif #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = v * std::pow( -1, gi ); } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = 42; // dummy ghost value 
#else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = v * std::pow( -1, i ); diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index 867adb069..27422513b 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -55,6 +55,14 @@ protected: using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; template< typename Real > using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 4 : 0; #else using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > @@ -167,13 +175,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); using VectorType = typename TestFixture::VectorType; \ using VectorOrView = typename TestFixture::VectorOrView; \ using CommunicatorType = typename VectorOrView::CommunicatorType; \ - const auto group = CommunicatorType::AllGroup; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ \ VectorType _V1, _V2; \ - 
_V1.setDistribution( localRange, size, group ); \ - _V2.setDistribution( localRange, size, group ); \ + _V1.setDistribution( localRange, this->ghosts, size, this->group ); \ + _V2.setDistribution( localRange, this->ghosts, size, this->group ); \ + \ + auto _synchronizer = std::make_shared( localRange, this->ghosts / 2, this->group ); \ + _V1.setSynchronizer( _synchronizer ); \ + _V2.setSynchronizer( _synchronizer ); \ \ _V1 = 1; \ _V2 = 2; \ @@ -188,14 +200,14 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \ using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \ using CommunicatorType = typename VectorOrView::CommunicatorType; \ - const auto group = CommunicatorType::AllGroup; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ \ HostVector _V1h; \ HostExpectedVector expected_h; \ - _V1h.setDistribution( localRange, size, group ); \ - expected_h.setDistribution( localRange, size, group ); \ + _V1h.setDistribution( localRange, this->ghosts, size, this->group ); \ + expected_h.setDistribution( localRange, this->ghosts, size, this->group ); \ \ const double h = (double) (end - begin) / size; \ for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \ @@ -204,10 +216,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); _V1h[ i ] = x; \ expected_h[ i ] = function(x); \ } \ + for( int i = localRange.getSize(); i < _V1h.getLocalView().getSize(); 
i++ ) \ + _V1h.getLocalView()[ i ] = expected_h.getLocalView()[ i ] = 0; \ \ VectorType _V1; _V1 = _V1h; \ VectorOrView V1( _V1 ); \ ExpectedVector expected; expected = expected_h; \ + \ + auto _synchronizer = std::make_shared( localRange, this->ghosts / 2, this->group ); \ + _V1.setSynchronizer( _synchronizer ); \ + expected.setSynchronizer( _synchronizer ); \ + expected.startSynchronization(); \ #else #define SETUP_UNARY_VECTOR_TEST( size ) \ diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index ac7fa79d6..4ad0c8303 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -56,6 +56,14 @@ protected: using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; template< typename Real > using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + // some arbitrary value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 
4 : 0; #else using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > @@ -75,11 +83,11 @@ protected: setLinearSequence( V1 ); #else #ifdef DISTRIBUTED_VECTOR - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; using LocalRangeType = typename VectorOrView::LocalRangeType; + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); - - _V1.setDistribution( localRange, size, group ); + _V1.setDistribution( localRange, ghosts, size, group ); + _V1.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); #else _V1.setSize( size ); #endif diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h index ea5a7e582..4cc584672 100644 --- a/src/UnitTests/Matrices/DistributedMatrixTest.h +++ b/src/UnitTests/Matrices/DistributedMatrixTest.h @@ -89,7 +89,7 @@ protected: using LocalRangeType = typename DistributedMatrix::LocalRangeType; const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); matrix.setDistribution( localRange, globalSize, globalSize, group ); - rowCapacities.setDistribution( localRange, globalSize, group ); + rowCapacities.setDistribution( localRange, 0, globalSize, group ); EXPECT_EQ( matrix.getLocalRowRange(), localRange ); EXPECT_EQ( matrix.getCommunicationGroup(), group ); @@ -215,7 +215,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput ) GlobalVector inVector( this->globalSize ); inVector.setValue( 1 ); - DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector outVector( 
this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); this->matrix.vectorProduct( inVector, outVector ); EXPECT_EQ( outVector, this->rowCapacities ) @@ -230,9 +230,9 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput ) this->matrix.setRowCapacities( this->rowCapacities ); setMatrix( this->matrix, this->rowCapacities ); - DistributedVector inVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector inVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); inVector.setValue( 1 ); - DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); this->matrix.vectorProduct( inVector, outVector ); EXPECT_EQ( outVector, this->rowCapacities ) -- GitLab From 4f5444a5f2bacf1d589714222229f57329eb5991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 18 Nov 2020 23:49:42 +0100 Subject: [PATCH 25/50] DistributedMatrix: implemented vectorProduct using ghost ranges --- src/TNL/Matrices/DistributedMatrix_impl.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h index d42ce0ae7..b9638e002 100644 --- a/src/TNL/Matrices/DistributedMatrix_impl.h +++ b/src/TNL/Matrices/DistributedMatrix_impl.h @@ -285,7 +285,6 @@ DistributedMatrix< Matrix, Communicator >:: vectorProduct( const InVector& inVector, OutVector& outVector ) const { - TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" ); TNL_ASSERT_EQ( inVector.getLocalRange(), getLocalRowRange(), "input vector has wrong distribution" ); TNL_ASSERT_EQ( inVector.getCommunicationGroup(), getCommunicationGroup(), "input vector 
has wrong communication group" ); TNL_ASSERT_EQ( outVector.getSize(), getRows(), "output vector has wrong size" ); @@ -295,7 +294,24 @@ vectorProduct( const InVector& inVector, if( getCommunicationGroup() == CommunicatorType::NullGroup ) return; - const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() ); + if( inVector.getGhosts() == 0 ) { + // NOTE: this branch is deprecated and kept only due to existing benchmarks + TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" ); + const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() ); + } + else { + TNL_ASSERT_EQ( inVector.getConstLocalViewWithGhosts().getSize(), localMatrix.getColumns(), "the matrix uses non-local and non-ghost column indices" ); + TNL_ASSERT_EQ( inVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "input vector has wrong ghosts size" ); + TNL_ASSERT_EQ( outVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "output vector has wrong ghosts size" ); + TNL_ASSERT_EQ( outVector.getConstLocalView().getSize(), localMatrix.getRows(), "number of local matrix rows does not match the output vector local size" ); + + inVector.waitForSynchronization(); + const auto inView = inVector.getConstLocalViewWithGhosts(); + auto outView = outVector.getLocalView(); + localMatrix.vectorProduct( inView, outView ); + // TODO: synchronization is not always necessary, e.g. 
when a preconditioning step follows +// outVector.startSynchronization(); + } } template< typename Matrix, -- GitLab From 364f03bf3078fe101cbf6b67d71a54ca2a8f3800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 20 Nov 2020 14:30:31 +0100 Subject: [PATCH 26/50] Fixed the diagonal preconditioner for ghost ranges --- .../Linear/Preconditioners/Diagonal_impl.h | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h index f30151548..474a78f21 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h @@ -49,14 +49,7 @@ void Diagonal< Matrix >:: solve( ConstVectorViewType b, VectorViewType x ) const { - ConstVectorViewType diag_view( diagonal ); - - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - x[ i ] = b[ i ] / diag_view[ i ]; - }; - - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + x = b / diagonal; } @@ -66,23 +59,32 @@ Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >:: update( const MatrixPointer& matrixPointer ) { TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" ); - TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" ); - diagonal.setSize( matrixPointer->getLocalMatrix().getRows() ); LocalViewType diag_view( diagonal ); // FIXME: SparseMatrix::getConstView is broken // const auto matrix_view = matrixPointer->getLocalMatrix().getConstView(); const auto matrix_view = matrixPointer->getLocalMatrix().getView(); - const auto row_range = matrixPointer->getLocalRowRange(); - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - const IndexType gi = row_range.getGlobalIndex( i ); - diag_view[ i ] = matrix_view.getElement( i, gi ); - }; - - Algorithms::ParallelFor< DeviceType >::exec( 
(IndexType) 0, diagonal.getSize(), kernel ); + if( matrixPointer->getRows() == matrixPointer->getColumns() ) { + // square matrix, assume global column indices + const auto row_range = matrixPointer->getLocalRowRange(); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + const IndexType gi = row_range.getGlobalIndex( i ); + diag_view[ i ] = matrix_view.getElement( i, gi ); + }; + Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + } + else { + // non-square matrix, assume ghost indexing + TNL_ASSERT_LT( matrixPointer->getLocalMatrix().getRows(), matrixPointer->getLocalMatrix().getColumns(), "the local matrix should have more columns than rows" ); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + diag_view[ i ] = matrix_view.getElement( i, i ); + }; + Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + } } template< typename Matrix, typename Communicator > @@ -94,15 +96,14 @@ solve( ConstVectorViewType b, VectorViewType x ) const const auto b_view = b.getConstLocalView(); auto x_view = x.getLocalView(); - TNL_ASSERT_EQ( b_view.getSize(), diagonal.getSize(), "The size of the vector b does not match the size of the extracted diagonal." ); - TNL_ASSERT_EQ( x_view.getSize(), diagonal.getSize(), "The size of the vector x does not match the size of the extracted diagonal." 
); + // wait for pending synchronization + b.waitForSynchronization(); - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - x_view[ i ] = b_view[ i ] / diag_view[ i ]; - }; + // compute without ghosts (diagonal includes only local rows) + x_view = b_view / diag_view; - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + // synchronize ghosts + x.startSynchronization(); } } // namespace Preconditioners -- GitLab From c8815636058e85882b31967bda048284af8f6511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 2 Dec 2020 17:00:13 +0100 Subject: [PATCH 27/50] Fixed hardcoded entity shapes in VTUWriter --- src/TNL/Meshes/Writers/VTUWriter.hpp | 41 +++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp index 61872ffe1..c8093010d 100644 --- a/src/TNL/Meshes/Writers/VTUWriter.hpp +++ b/src/TNL/Meshes/Writers/VTUWriter.hpp @@ -83,6 +83,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -94,7 +95,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, connectivity.push_back( i ); connectivity.push_back( i+1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -106,6 +107,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& 
mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -116,7 +118,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, { connectivity.push_back( i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -128,6 +130,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 2 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 2 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -142,7 +145,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -154,6 +157,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -161,21 +165,21 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, std::vector< std::uint8_t > & types ) { for( MeshIndex j = 0; j < mesh.getDimensions().y(); j++ ) - for( MeshIndex i = 0; i < ( mesh.getDimensions().x() + 1 ); i++ ) + for( MeshIndex i = 0; i < (mesh.getDimensions().x() + 1); i++ ) { connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i 
); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } - for( MeshIndex j = 0; j < (mesh.getDimensions().y()+1); j++ ) + for( MeshIndex j = 0; j < (mesh.getDimensions().y() + 1); j++ ) for( MeshIndex i = 0; i < mesh.getDimensions().x(); i++ ) { connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -187,6 +191,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -198,7 +203,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, { connectivity.push_back( j * mesh.getDimensions().x() + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -210,6 +215,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 3 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -229,7 +235,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i 
); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Voxel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -241,6 +247,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 2 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -256,7 +263,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ ) @@ -268,7 +275,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ ) @@ -280,7 +287,7 @@ struct 
MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -292,6 +299,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -305,7 +313,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ ) @@ -315,7 +323,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) 
VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ ) @@ -325,7 +333,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -337,6 +345,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -349,7 +358,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, { connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; -- GitLab From 99b346f6dee1fbb831eb315b36d556ab3dfc6084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 2 Dec 2020 18:24:57 +0100 Subject: [PATCH 28/50] Added overload of getEntityMeasure for hexahedrons --- src/TNL/Meshes/Geometry/getEntityCenter.h | 2 +- src/TNL/Meshes/Geometry/getEntityMeasure.h | 24 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/TNL/Meshes/Geometry/getEntityCenter.h 
b/src/TNL/Meshes/Geometry/getEntityCenter.h index 6e869f6ec..addef6b9f 100644 --- a/src/TNL/Meshes/Geometry/getEntityCenter.h +++ b/src/TNL/Meshes/Geometry/getEntityCenter.h @@ -39,7 +39,7 @@ getEntityCenter( const Mesh< MeshConfig, Device > & mesh, /* * Get an arithmetic mean of the entity's subvertices. * - * For an simplex entity this corresponds to the centroid of the entity, but + * For a simplex entity this corresponds to the centroid of the entity, but * note that other shapes such as general polygons have different formulas for * the centroid: https://en.wikipedia.org/wiki/Centroid#Centroid_of_a_polygon */ diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h index 70d5614ce..fb1e2d468 100644 --- a/src/TNL/Meshes/Geometry/getEntityMeasure.h +++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace TNL { namespace Meshes { @@ -148,5 +149,28 @@ getEntityMeasure( const Mesh< MeshConfig, Device > & mesh, return getTetrahedronVolume( v3 - v0, v2 - v0, v1 - v0 ); } +template< typename MeshConfig, typename Device > +__cuda_callable__ +typename MeshConfig::RealType +getEntityMeasure( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, Topologies::Hexahedron > & entity ) +{ + const auto& v0 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 1 ) ); + const auto& v2 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 2 ) ); + const auto& v3 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 3 ) ); + const auto& v4 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 4 ) ); + const auto& v5 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 5 ) ); + const auto& v6 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 6 ) ); + const auto& v7 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 7 ) ); + // 
https://www.cfd-online.com/Forums/main/163122-volume-general-hexahedron.html#post574650 + return getTetrahedronVolume( v0 - v4, v3 - v4, v1 - v4 ) + + getTetrahedronVolume( v2 - v4, v3 - v4, v1 - v4 ) + + getTetrahedronVolume( v1 - v4, v2 - v4, v5 - v4 ) + + getTetrahedronVolume( v6 - v4, v2 - v4, v5 - v4 ) + + getTetrahedronVolume( v3 - v4, v2 - v4, v7 - v4 ) + + getTetrahedronVolume( v6 - v4, v2 - v4, v7 - v4 ); +} + } // namespace Meshes } // namespace TNL -- GitLab From 45e6fc1a6ec563b949c51cff456804b4023d4eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 2 Dec 2020 17:08:05 +0100 Subject: [PATCH 29/50] pytnl: added bindings for MeshOfQuadrangles and MeshOfHexahedrons --- src/Python/pytnl/tnl/Mesh.cpp | 2 ++ src/Python/pytnl/tnl/MeshReaders.cpp | 2 ++ src/Python/pytnl/tnl/MeshWriters.cpp | 4 +++ src/Python/pytnl/tnl_mpi/DistributedMesh.cpp | 2 ++ .../pytnl/tnl_mpi/DistributedMeshReaders.cpp | 2 ++ .../pytnl/tnl_mpi/DistributedMeshWriters.cpp | 2 ++ src/Python/pytnl/tnl_mpi/tnl_mpi.cpp | 4 +++ src/Python/pytnl/typedefs.h | 32 ++++++++----------- 8 files changed, 32 insertions(+), 18 deletions(-) diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp index a3e582680..48e3f939b 100644 --- a/src/Python/pytnl/tnl/Mesh.cpp +++ b/src/Python/pytnl/tnl/Mesh.cpp @@ -7,5 +7,7 @@ void export_Meshes( py::module & m ) { export_Mesh< MeshOfEdges >( m, "MeshOfEdges" ); export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" ); + export_Mesh< MeshOfQuadrangles >( m, "MeshOfQuadrangles" ); export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" ); + export_Mesh< MeshOfHexahedrons >( m, "MeshOfHexahedrons" ); } diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp index d47ec5268..c4abae015 100644 --- a/src/Python/pytnl/tnl/MeshReaders.cpp +++ b/src/Python/pytnl/tnl/MeshReaders.cpp @@ -17,7 +17,9 @@ void export_MeshReaders( py::module & m ) .def("detectMesh", &MeshReader::detectMesh) 
.def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >) .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfQuadrangles >) .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfHexahedrons >) ; py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" ) diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp index 17c3c7492..78eca5e05 100644 --- a/src/Python/pytnl/tnl/MeshWriters.cpp +++ b/src/Python/pytnl/tnl/MeshWriters.cpp @@ -83,6 +83,10 @@ void export_MeshWriters( py::module & m ) export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" ); export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTriangles" ); export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfQuadrangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfQuadrangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfQuadrangles >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfQuadrangles" ); export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTetrahedrons" ); export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfHexahedrons >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfHexahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfHexahedrons >, 
TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfHexahedrons" ); } diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp index 03ee3692e..0af175f3c 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp @@ -12,7 +12,9 @@ void export_DistributedMeshes( py::module & m ) export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" ); export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" ); + export_DistributedMesh< DistributedMeshOfQuadrangles >( m, "DistributedMeshOfQuadrangles" ); export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" ); + export_DistributedMesh< DistributedMeshOfHexahedrons >( m, "DistributedMeshOfHexahedrons" ); // export VTKTypesArrayType using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType; diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp index e972eb65e..c196a67cc 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp @@ -19,6 +19,8 @@ void export_DistributedMeshReaders( py::module & m ) // loadMesh is not virtual in PVTUReader .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >) .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfQuadrangles >) .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfHexahedrons >) ; } diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp index 4d1d18bae..089d59adf 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp +++ 
b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp @@ -90,5 +90,7 @@ void export_DistributedMeshWriters( py::module & m ) constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed; export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges, default_format >( m, "PVTUWriter_MeshOfEdges" ); export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles, default_format >( m, "PVTUWriter_MeshOfTriangles" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfQuadrangles, default_format >( m, "PVTUWriter_MeshOfQuadrangles" ); export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfHexahedrons, default_format >( m, "PVTUWriter_MeshOfHexahedrons" ); } diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp index be7813959..a12060600 100644 --- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -39,6 +39,10 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m) using TNL::Meshes::DistributedMeshes::distributeSubentities; m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) { distributeSubentities< 1 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfQuadrangles& mesh ) { + distributeSubentities< 1 >( mesh ); }); m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) { distributeSubentities< 2 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfHexahedrons& mesh ) { + distributeSubentities< 2 >( mesh ); }); } diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h index ac4b6bd83..7bc9fe025 100644 --- a/src/Python/pytnl/typedefs.h +++ b/src/Python/pytnl/typedefs.h @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include using RealType = double; using DeviceType = TNL::Devices::Host; 
@@ -31,28 +33,22 @@ using Grid2D = TNL::Meshes::Grid<2, RealType, DeviceType, IndexType>; using Grid3D = TNL::Meshes::Grid<3, RealType, DeviceType, IndexType>; using LocalIndexType = short int; -using EdgeTopology = TNL::Meshes::Topologies::Edge; -using TriangleTopology = TNL::Meshes::Topologies::Triangle; -using TetrahedronTopology = TNL::Meshes::Topologies::Tetrahedron; -using MeshOfEdges = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - EdgeTopology, - EdgeTopology::dimension, - RealType, - IndexType, - LocalIndexType > >; -using MeshOfTriangles = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - TriangleTopology, - TriangleTopology::dimension, - RealType, - IndexType, - LocalIndexType > >; -using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - TetrahedronTopology, - TetrahedronTopology::dimension, +template< typename Topology > +using DefaultMeshTemplate = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< + Topology, + Topology::dimension, RealType, IndexType, LocalIndexType > >; +using MeshOfEdges = DefaultMeshTemplate< TNL::Meshes::Topologies::Edge >; +using MeshOfTriangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Triangle >; +using MeshOfQuadrangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Quadrangle >; +using MeshOfTetrahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Tetrahedron >; +using MeshOfHexahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Hexahedron >; + using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >; using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >; +using DistributedMeshOfQuadrangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfQuadrangles >; using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >; +using DistributedMeshOfHexahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfHexahedrons >; -- GitLab From 
966d73877cfec43065f55a112a0762e846f02c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 5 Dec 2020 20:54:10 +0100 Subject: [PATCH 30/50] pytnl: for completeness, export bindings for mesh writers for grids --- src/Python/pytnl/tnl/MeshWriters.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp index 78eca5e05..01f79ce2d 100644 --- a/src/Python/pytnl/tnl/MeshWriters.cpp +++ b/src/Python/pytnl/tnl/MeshWriters.cpp @@ -79,6 +79,13 @@ void export_MeshWriter( py::module & m, const char* name ) void export_MeshWriters( py::module & m ) { + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid1D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid1D" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid2D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid2D" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid3D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid3D" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfEdges" ); export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" ); export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTriangles" ); -- GitLab From 863a4f698e894146739e957e16c8a99f24a5906c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 20 Dec 2020 17:00:20 +0100 Subject: [PATCH 31/50] getOutwardNormalVector: added overloads for 2D and 3D unstructured meshes --- .../Meshes/Geometry/getOutwardNormalVector.h | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h index 536800862..d3fa6ea50 100644 --- a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h +++ b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h @@ -11,6 +11,7 @@ #pragma once #include +#include namespace TNL { namespace Meshes { @@ -87,5 +88,63 @@ getOutwardNormalVector( const Grid & grid, } } +template< typename MeshConfig, typename Device > +__cuda_callable__ +typename MeshTraits< MeshConfig >::PointType +getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, Topologies::Edge > & face, + typename MeshTraits< MeshConfig >::PointType cellCenter ) +{ + using MeshType = Mesh< MeshConfig, Device >; + using FaceType = MeshEntity< MeshConfig, Device, Topologies::Edge >; + using PointType = typename MeshTraits< MeshConfig >::PointType; + static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" ); + static_assert( MeshConfig::worldDimension == 2, "TODO: normal vectors for 2D meshes in a 3D space are not implemented yet" ); + + const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) ); + const PointType u = v0 - v1; + const PointType n {u[1], -u[0]}; + + // check on which side of the face is the reference cell center + const PointType faceCenter = getEntityCenter( mesh, face ); + if( dot( n, cellCenter - faceCenter ) < 0 ) + return n / l2Norm( n ); + else + return - n / l2Norm( n ); +} + +template< typename MeshConfig, typename Device, typename EntityTopology > 
+__cuda_callable__ +typename MeshTraits< MeshConfig >::PointType +getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, EntityTopology > & face, + typename MeshTraits< MeshConfig >::PointType cellCenter ) +{ + using MeshType = Mesh< MeshConfig, Device >; + using FaceType = MeshEntity< MeshConfig, Device, EntityTopology >; + using PointType = typename MeshTraits< MeshConfig >::PointType; + static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" ); + static_assert( MeshConfig::worldDimension == 3, "general overload intended for 3D was called with the wrong world dimension" ); + + const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) ); + const auto& v2 = mesh.getPoint( face.template getSubentityIndex< 0 >( 2 ) ); + const PointType u1 = v0 - v1; + const PointType u2 = v0 - v2; + const PointType n { + u1.y() * u2.z() - u1.z() * u2.y(), // first component of the cross product + u1.z() * u2.x() - u1.x() * u2.z(), // second component of the cross product + u1.x() * u2.y() - u1.y() * u2.x() // third component of the cross product + }; + + // check on which side of the face is the reference cell center + const PointType faceCenter = getEntityCenter( mesh, face ); + if( dot( n, cellCenter - faceCenter ) < 0 ) + return n / l2Norm( n ); + else + return - n / l2Norm( n ); +} + } // namespace Meshes } // namespace TNL -- GitLab From 65e1cc9e42c05068d3fcc5080f1e38451f384d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 26 Dec 2020 14:46:48 +0100 Subject: [PATCH 32/50] Fixed ILU preconditioners for distributed matrices --- src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h | 3 --- src/TNL/Solvers/Linear/Preconditioners/ILU0.h | 7 ++++++- src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h | 3 +++ 
src/TNL/Solvers/Linear/Preconditioners/ILUT.h | 7 ++++++- src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h | 3 +++ src/TNL/Solvers/Linear/Traits.h | 4 ++++ 6 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h index 474a78f21..d2227e57b 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h @@ -96,9 +96,6 @@ solve( ConstVectorViewType b, VectorViewType x ) const const auto b_view = b.getConstLocalView(); auto x_view = x.getLocalView(); - // wait for pending synchronization - b.waitForSynchronization(); - // compute without ghosts (diagonal includes only local rows) x_view = b_view / diag_view; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index c4b409bb3..857d8a063 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -90,7 +90,12 @@ protected: template< typename M > static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m ) { - return m.getLocalRowRange().getBegin(); + if( m.getRows() == m.getColumns() ) + // square matrix, assume global column indices + return m.getLocalRowRange().getBegin(); + else + // non-square matrix, assume ghost indexing + return 0; } }; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h index c11909c07..f68a93f16 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h @@ -145,6 +145,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const // Step 2: solve x from Ux = y triangularSolveUpper< true, true >( U, x, x ); + + // synchronize ghosts + Traits< Matrix >::startSynchronization( _x ); } diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h 
b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h index d46f3f900..344daf1a0 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h @@ -79,7 +79,12 @@ protected: template< typename M > static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m ) { - return m.getLocalRowRange().getBegin(); + if( m.getRows() == m.getColumns() ) + // square matrix, assume global column indices + return m.getLocalRowRange().getBegin(); + else + // non-square matrix, assume ghost indexing + return 0; } }; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h index c9c2a0b77..21b895c48 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h @@ -272,6 +272,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const // Step 2: solve x from Ux = y triangularSolveUpper< true, false >( U, x, x ); + + // synchronize ghosts + Traits< Matrix >::startSynchronization( _x ); } } // namespace Preconditioners diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h index 5f93e0cde..83313ed98 100644 --- a/src/TNL/Solvers/Linear/Traits.h +++ b/src/TNL/Solvers/Linear/Traits.h @@ -52,6 +52,8 @@ struct Traits static LocalViewType getLocalView( VectorViewType v ) { return v; } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; } + static void startSynchronization( VectorViewType v ) {} + static void waitForSynchronization( VectorViewType v ) {} }; template< typename Matrix, typename Communicator > @@ -95,6 +97,8 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return 
m.getCommunicationGroup(); } + static void startSynchronization( VectorViewType v ) { v.startSynchronization(); } + static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); } }; } // namespace Linear -- GitLab From b9d087074e03d8ff2cac6cddab5b7667f8b31aad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 27 Dec 2020 18:29:59 +0100 Subject: [PATCH 33/50] Use MPI_Init_thread instead of MPI_Init This allows the user to set the required thread level. Initializing MPI with threading support is needed when MPI functions are called from multiple threads. All common MPI libraries seem to provide this feature. --- src/TNL/Communicators/MpiCommunicator.h | 36 +++++++++++++++++++++++-- src/TNL/Communicators/MpiDefs.h | 8 ++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 18143cce0..dedc35f03 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -142,10 +142,42 @@ class MpiCommunicator return true; } - static void Init(int& argc, char**& argv ) + static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { #ifdef HAVE_MPI - MPI_Init( &argc, &argv ); + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + case MPI_THREAD_FUNNELED: + case MPI_THREAD_SERIALIZED: + case MPI_THREAD_MULTIPLE: + break; + default: + printf("ERROR: invalid argument for the 'required' thread level support: %d\n", required_thread_level); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int provided; + MPI_Init_thread( &argc, &argv, required_thread_level, &provided ); + if( provided < required_thread_level ) { + const char* level = ""; + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + level = "MPI_THREAD_SINGLE"; + break; + case MPI_THREAD_FUNNELED: + level = "MPI_THREAD_FUNNELED"; + break; + case MPI_THREAD_SERIALIZED: + level = "MPI_THREAD_SERIALIZED"; 
+ break; + case MPI_THREAD_MULTIPLE: + level = "MPI_THREAD_MULTIPLE"; + break; + } + printf("ERROR: The MPI library does not have the required level of thread support: %s\n", level); + MPI_Abort(MPI_COMM_WORLD, 1); + } + selectGPU(); #endif diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/Communicators/MpiDefs.h index 957354b9d..df43005ec 100644 --- a/src/TNL/Communicators/MpiDefs.h +++ b/src/TNL/Communicators/MpiDefs.h @@ -25,4 +25,12 @@ enum MPI_Op { MPI_MINLOC, MPI_MAXLOC, }; + +// MPI_Init_thread constants +enum { + MPI_THREAD_SINGLE, + MPI_THREAD_FUNNELED, + MPI_THREAD_SERIALIZED, + MPI_THREAD_MULTIPLE +}; #endif -- GitLab From cff4ab335edf759275959b97f38d83645f5ca6d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 27 Dec 2020 18:45:50 +0100 Subject: [PATCH 34/50] Implemented asynchronous operations for ByteArraySynchronizer --- CMakeLists.txt | 2 +- src/3rdparty/async/README.md | 532 ++++++++++++++++++ src/3rdparty/async/bounded_queue.h | 342 +++++++++++ src/3rdparty/async/queue.h | 429 ++++++++++++++ src/3rdparty/async/threadpool.h | 192 +++++++ src/3rdparty/async/utility.h | 66 +++ src/TNL/Containers/ByteArraySynchronizer.h | 117 +++- src/TNL/Containers/DistributedArray.h | 2 + src/TNL/Containers/DistributedArray.hpp | 12 + src/TNL/Containers/DistributedArrayView.h | 4 + src/TNL/Containers/DistributedArrayView.hpp | 27 +- src/TNL/Containers/Partitioner.h | 19 +- .../DistributedMeshSynchronizer.h | 42 +- .../Containers/DistributedArrayTest.h | 6 + 14 files changed, 1758 insertions(+), 34 deletions(-) create mode 100644 src/3rdparty/async/README.md create mode 100644 src/3rdparty/async/bounded_queue.h create mode 100644 src/3rdparty/async/queue.h create mode 100644 src/3rdparty/async/threadpool.h create mode 100644 src/3rdparty/async/utility.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 05a0fd0b6..b85842c1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,7 +210,7 @@ if( ${WITH_CUDA} ) set( CUDA_HOST_COMPILER 
${CMAKE_CXX_COMPILER} ) endif() endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda --default-stream per-thread) # disable false compiler warnings # reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937 # incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg diff --git a/src/3rdparty/async/README.md b/src/3rdparty/async/README.md new file mode 100644 index 000000000..36106864a --- /dev/null +++ b/src/3rdparty/async/README.md @@ -0,0 +1,532 @@ +# async +Homepage: https://github.com/d36u9/async + +[[License(Boost Software License - Version 1.0)](http://www.boost.org/LICENSE_1_0.txt)] + +## Welcome +async is a tiny C++ header-only high-performance library for async calls handled by a thread-pool, which is built on top of an unbounded MPMC lock-free queue. +It's written in pure C++14 (C++11 support with preprocessor macros), no dependencies on other 3rd party libraries. + +Note: This library is originally designed for 64bit system. It has been tested on arch X86-64 and ARMV8(64bit), and ARMV7(32bit). + +## change logs +* Jun. 2018: + * Added support for ARMV7 & V8 + * Tested on Raspberry Pi 3 B+ with Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) + * Tested on Raspberry Pi 3 B+ with Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) + * Added Benchmark Results for Raspberry Pi 3 B+ ARMV8 (Linux Pi64 4.14.44-V8 AArch64) + * Added Benchmark Results for Raspberry Pi 3 B+ ARMV7 32bit (Linux 4.14.34-v7 armv7l) +* Sept. 2017: + * Significantly improved the performance of async::queue without bulk operations. + * async::threadpool also benifits from this change. 
+ * A bounded MPMC queue `async::bounded_queue` was added to the lib, which is pretty useful for memory constrained system or some fixed-size message pipeline design. The overall performance of this buffer based `async::bounded_queue` is comparable to bulk operations of node-based `async::queue`. `async::bounded_queue` shares the almost identical interface as `async::queue`, except for bulk operations, and a size parameter has to be passed to `bounded_queue`'s constructor, and also added blocking methods (`blocking_enqueue` & `blocking_dequeue`). `TRAIT::NOEXCEPT_CHECK` setting is also similar to `async::queue` to help handle exceptions that may be thrown in element's ctor. `bounded_queue` is basically a C++ implementation of [PTLQueue](https://blogs.oracle.com/dave/ptlqueue-:-a-scalable-bounded-capacity-mpmc-queue) design (Please read Dave Dice's article for details and references). + +## Features +* interchangeable with std::async, accepts all kinds of callable instances, like static functions, member functions, functors, lambdas +* dynamically changeable thread-pool size at run-time +* tasks are managed in a lock-free queue +* provided lock-free queue doesn't have restricted limitation as boost::lockfree::queue +* low-latency for the task execution thanks to underlying lock-free queue + +## Tested Platforms& Compilers +(old versions of OSs or compilers may work, but not tested) +* Windows 10 Visual Studio 2015+ +* Linux Ubuntu 16.04 gcc4.9.2+/clang 3.8+ +* MacOS Sierra 10.12.5 clang-802.0.42 + +## Getting Started +## Building the test& benchmark + +### C++11 compilers +If your compiler only supports C++11, please edit CMakeLists.txt with the following change: +``` +set(CMAKE_CXX_STANDARD 14) +#change to +set(CMAKE_CXX_STANDARD 11) +``` + +### Build& test with Microsoft C++ REST SDK +If your OS is Windows or has cpprestsdk installed& configured on Linux or Mac, please edit CMakeLists.txt to enable PPL test: +``` +option(WITH_CPPRESTSDK "Build Cpprestsdk Test"
OFF) +#to +option(WITH_CPPRESTSDK "Build Cpprestsdk Test" ON) +``` + + +### Build for Linux or Mac (x86-64 & ARMV7&V8) +``` +#to use clang (linux) with following export command +#EXPORT CC=clang-3.8 +#EXPORT CXX=clang++-3.8 +#run the following to set up release build, (for MacOS Xcode, you can remove -DCMAKE_BUILD_TYPE for now, and choose build type at build-time) +cmake -H. -Bbuild -DCMAKE_BUILD_TYPE=RELEASE +#now build the release +cmake --build build --config Release +#or debug +cmake --build build --config Debug +#or other builds +cmake --build build --config RelWithDebInfo +cmake --build build --config MinSizeRel +``` + +### Build for Windows (X86-64) +``` +#for VS 2015 +cmake -H. -Bbuild -G "Visual Studio 14 2015 Win64" +#or VS 2017 +cmake -H. -Bbuild -G "Visual Studio 15 2017 Win64" +#build the release from command line or you can open the project file in Visual Studio, and build from there +cmake --build build --config Release +``` + +## How to use it in your project/application +simply copy all headers in async sub-folder to your project, and include those headers in your source code.
+ +## Thread Pool Introduction +### Thread Pool initializations + +``` +async::threadpool tp; //by default, thread pool size will be the same number of your hardware CPU core/threads +async::threadpool tp(8); //create a thread pool with 8 threads +async::threadpool tp(0); //create a thread pool with no threads available, it's in pause mode +``` + +### resize the thread pool +``` +async::threadpool tp(32); +...//some operations +tp.configurepool(16);// can be called at anytime (as long as tp is still valid) to reset the pool size + // no interruption for running tasks +``` +### submit the task +*static functions, member functions, functors, lambdas are all supported +``` +int foo(int i) { return ++i; } +auto pkg = tp.post(foo, i); //returns a std::future +pkg.get(); //will block +``` + +## multi-producer multi-consumer unbounded lock-free queue Introduction +The design: A simple and classic implementation. It's link-based 3-level depth nested container with local array for each level storage and simulated tagged pointer for linking. +The size of each level, and tag bits can be configured through TRAITS (please see source for details). +The queue with default traits settings can store up to 1 Trillion elements/nodes (at least 1 Terabyte memory space). + +### element type requirements +* nothrow destructible +* optional (better to be true) + * nothrow constructible + * nothrow move-assignable + +NOTE: the exception thrown by constructor is acceptable. Although it'd be better to keep ctor noexcept if possible. +noexcept detection is turned off by default, it can be turned on by setting `TRAIT::NOEXCEPT_CHECK` to true. +With `TRAIT::NOEXCEPT_CHECK` on(true), queue will enable exception handling if ctor or move assignment may throw exceptions.
+ + +### queue initializations +``` +async::queue q; //default constructor, it's unbounded + +async::queue q(1000); // pre-allocated 1000 storage nodes, the capacity will increase automatically after 1000 nodes are used +``` +### usage +``` +// enqueues a T constructed from args, supports the following constructions: +// move, if args is a T rvalue +// copy, if args is a T lvalue, or +// emplacement if args is an initializer list that can be passed to a T constructor +async::queue::enqueue(Args... args) + +async::queue::dequeue(T& data) //type T should have move assignment operator, +//e.g. +async::queue q; +q.enqueue(11); +int i(0); +q.dequeue(i); + +``` +### bulk operations +It's convenient for bulk data, and also can boost the throughput. +exception handling is not available in bulk operations even with `TRAIT::NOEXCEPT_CHECK` being true. +bulk operations are suitable for plain data types, like network/event messages. + +``` +int a[] = {1,2,3,4,5}; +int b[5]; +q.bulk_enqueue(std::begin(a), 5); +auto popcount = q.bulk_dequeue(std::begin(b), 5); //popcount is the number of elements successfully pulled from the queue. +//or like the following code: +std::vector v; +auto it = std::inserter(v, std::begin(v)); +popcount = q.bulk_dequeue(it, 5); +``` + +## Unit Test +The unit test code provides most samples for usage. + +## Benchmark +NOTE: the results may vary on different OS platforms and hardware. +### thread pool benchmark +The benchmark is a simple demonstration. +NOTE: may require extra config, please see CMakeLists.txt for detailed settings +The test benchmarks the following task/job based async implementation: +* async::threadpool (this library) +* std::async +* boost::async +* AsioThreadPool (my another implementation based on boost::asio, has very stable and good performance, especially on Windows with iocp) +* Microsoft::PPL (pplx from [cpprestsdk](https://github.com/Microsoft/cpprestsdk) on Linux& MacOS or PPL on windows) + + +e.g.
Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64) +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 1130 ns max: 1227 ns min: 1066 ns avg_task_post: 1032 ns + *std::async (time/task) avg: 1469 ns max: 1549 ns min: 1423 ns avg_task_post: 1250 ns + *Microsoft::PPL (time/task) avg: 1148 ns max: 1216 ns min: 1114 ns avg_task_post: 1088 ns + AsioThreadPool (time/task) avg: 1166 ns max: 1319 ns min: 1013 ns avg_task_post: 1073 ns + *boost::async (time/task) avg: 29153 ns max: 30028 ns min: 27990 ns avg_task_post: 23343 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 439 ns max: 557 ns min: 398 ns avg_task_post: 356 ns + *std::async (time/task) avg: 800 ns max: 890 ns min: 759 ns avg_task_post: 629 ns + *Microsoft::PPL (time/task) avg: 666 ns max: 701 ns min: 640 ns avg_task_post: 605 ns + AsioThreadPool (time/task) avg: 448 ns max: 541 ns min: 389 ns avg_task_post: 365 ns + *boost::async (time/task) avg: 32419 ns max: 33296 ns min: 30105 ns avg_task_post: 25561 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 262 ns max: 300 ns min: 252 ns avg_task_post: 176 ns + *std::async (time/task) avg: 873 ns max: 961 ns min: 821 ns avg_task_post: 701 ns + *Microsoft::PPL (time/task) avg: 727 ns max: 755 ns min: 637 ns avg_task_post: 662 ns + AsioThreadPool (time/task) avg: 607 ns max: 645 ns min: 567 ns avg_task_post: 210 ns + *boost::async (time/task) avg: 33158 ns max: 150331 ns min: 28560 ns avg_task_post: 28655 ns +``` + +e.g. 
Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0 +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 1320 ns max: 1357 ns min: 1301 ns avg_task_post: 1266 ns + *std::async (time/task) avg: 11817 ns max: 12469 ns min: 11533 ns avg_task_post: 9580 ns + *Microsoft::PPL (time/task) avg: 1368 ns max: 1498 ns min: 1325 ns avg_task_post: 1349 ns + AsioThreadPool (time/task) avg: 1475 ns max: 1499 ns min: 1318 ns avg_task_post: 1332 ns + *boost::async (time/task) avg: 4574 ns max: 4697 ns min: 4450 ns avg_task_post: 4531 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 516 ns max: 688 ns min: 239 ns avg_task_post: 522 ns + *std::async (time/task) avg: 41630 ns max: 44316 ns min: 41334 ns avg_task_post: 38151 ns + *Microsoft::PPL (time/task) avg: 3652 ns max: 3710 ns min: 3598 ns avg_task_post: 3629 ns + AsioThreadPool (time/task) avg: 529 ns max: 814 ns min: 494 ns avg_task_post: 447 ns + *boost::async (time/task) avg: 14634 ns max: 14669 ns min: 14598 ns avg_task_post: 14583 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 398 ns max: 468 ns min: 337 ns avg_task_post: 177 ns + *std::async (time/task) avg: 44603 ns max: 46904 ns min: 44272 ns avg_task_post: 40877 ns + *Microsoft::PPL (time/task) avg: 3714 ns max: 3816 ns min: 3656 ns avg_task_post: 3690 ns + AsioThreadPool (time/task) avg: 564 ns max: 605 ns min: 533 ns avg_task_post: 253 ns + *boost::async (time/task) avg: 20421 ns max: 21738 ns min: 19105 ns avg_task_post: 20375 ns +``` + +e.g. 
MacOS 10.12.5 clang Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 (Microsoft::PPL(cpprestsdk::pplx) is superisingly good compared with other libraries on MacOS, not sure if it's due to some comipiler optimization) +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 8517 ns max: 8641 ns min: 7400 ns avg_task_post: 8393 ns + *std::async (time/task) avg: 13618 ns max: 13845 ns min: 13276 ns avg_task_post: 13476 ns + *Microsoft::PPL (time/task) avg: 747 ns max: 938 ns min: 626 ns avg_task_post: 718 ns + AsioThreadPool (time/task) avg: 8647 ns max: 8807 ns min: 8558 ns avg_task_post: 8524 ns + *boost::async (time/task) avg: 11732 ns max: 12028 ns min: 11526 ns avg_task_post: 11698 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 5964 ns max: 6017 ns min: 5790 ns avg_task_post: 5830 ns + *std::async (time/task) avg: 9690 ns max: 10043 ns min: 9132 ns avg_task_post: 9531 ns + *Microsoft::PPL (time/task) avg: 380 ns max: 425 ns min: 342 ns avg_task_post: 353 ns + AsioThreadPool (time/task) avg: 6173 ns max: 6459 ns min: 6116 ns avg_task_post: 6042 ns + *boost::async (time/task) avg: 8643 ns max: 9470 ns min: 8513 ns avg_task_post: 8591 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 3469 ns max: 3527 ns min: 3415 ns avg_task_post: 3339 ns + *std::async (time/task) avg: 10902 ns max: 11164 ns min: 10709 ns avg_task_post: 10738 ns + *Microsoft::PPL (time/task) avg: 367 ns max: 426 ns min: 326 ns avg_task_post: 323 ns + AsioThreadPool (time/task) avg: 3920 ns max: 3975 ns min: 3832 ns avg_task_post: 3409 ns + *boost::async (time/task) avg: 9800 ns max: 10223 ns min: 9196 ns avg_task_post: 9744 ns +``` + +e.g. 
Windows 7 64bit Intel i7-4790 16GB RAM Visual Studio 2015 Update 3 +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 809 ns max: 924 ns min: 687 ns avg_task_post: 774 ns + *std::async (time/task) avg: 1914 ns max: 2032 ns min: 1790 ns avg_task_post: 1877 ns + *Microsoft::PPL (time/task) avg: 1718 ns max: 2181 ns min: 1623 ns avg_task_post: 1677 ns + AsioThreadPool (time/task) avg: 1100 ns max: 1137 ns min: 1076 ns avg_task_post: 1065 ns + *boost::async (time/task) avg: 191532 ns max: 203716 ns min: 186114 ns avg_task_post: 191507 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 423 ns max: 538 ns min: 338 ns avg_task_post: 388 ns + *std::async (time/task) avg: 1249 ns max: 1279 ns min: 1233 ns avg_task_post: 1211 ns + *Microsoft::PPL (time/task) avg: 1229 ns max: 1246 ns min: 1208 ns avg_task_post: 1186 ns + AsioThreadPool (time/task) avg: 563 ns max: 577 ns min: 499 ns avg_task_post: 528 ns + *boost::async (time/task) avg: 95484 ns max: 112569 ns min: 93808 ns avg_task_post: 95458 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 267 ns max: 323 ns min: 255 ns avg_task_post: 232 ns + *std::async (time/task) avg: 1202 ns max: 1257 ns min: 1182 ns avg_task_post: 1009 ns + *Microsoft::PPL (time/task) avg: 1199 ns max: 1262 ns min: 1175 ns avg_task_post: 988 ns + AsioThreadPool (time/task) avg: 783 ns max: 960 ns min: 706 ns avg_task_post: 375 ns + *boost::async (time/task) avg: 103572 ns max: 107041 ns min: 101993 ns avg_task_post: 103542 ns +``` + +e.g. 
Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+ +``` +Benchmark Test Run: 1 Producers 3(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 7809 ns max: 10467 ns min: 7453 ns avg_task_post: 7261 ns + *std::async (time/task) avg: 139664 ns max: 3453077 ns min: 104589 ns avg_task_post: 117819 ns + AsioThreadPool (time/task) avg: 6545 ns max: 8804 ns min: 5678 ns avg_task_post: 5654 ns + *boost::async (time/task) avg: 37629 ns max: 38978 ns min: 36769 ns avg_task_post: 36933 ns + +Benchmark Test Run: 2 Producers 2(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 2207 ns max: 4084 ns min: 1809 ns avg_task_post: 1325 ns + *std::async (time/task) avg: 431781 ns max: 17500817 ns min: 91919 ns avg_task_post: 407595 ns + AsioThreadPool (time/task) avg: 2251 ns max: 3351 ns min: 1839 ns avg_task_post: 1405 ns + *boost::async (time/task) avg: 48456 ns max: 50578 ns min: 46698 ns avg_task_post: 47753 ns + +Benchmark Test Run: 3 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 3346 ns max: 3974 ns min: 2635 ns avg_task_post: 1017 ns + *std::async (time/task) avg: 110853 ns max: 768224 ns min: 103045 ns avg_task_post: 86361 ns + AsioThreadPool (time/task) avg: 3828 ns max: 4209 ns min: 3354 ns avg_task_post: 976 ns + *boost::async (time/task) avg: 59094 ns max: 67042 ns min: 54802 ns avg_task_post: 58365 ns +``` + +### queue benchmark +The benchmark uses producers-consumers model, and doesn't provide all the detailed measurements. +* async::bounded_queue +* async::queue +* boost::lockfree::queue +* boost::lockfree::spsc_queue (only for single-producer-single-consumer test) + +e.g. 
Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64) +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 18 ns max: 55 ns min: 17 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 50 ns min: 23 ns + async::queue (time/op) avg: 28 ns max: 66 ns min: 27 ns +boost::lockfree::queue (time/op) avg: 167 ns max: 195 ns min: 70 ns +boost::lockfree::spsc_queue (time/op) avg: 10 ns max: 38 ns min: 8 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 27 ns max: 62 ns min: 25 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 124 ns min: 24 ns + async::queue (time/op) avg: 42 ns max: 115 ns min: 29 ns +boost::lockfree::queue (time/op) avg: 240 ns max: 576 ns min: 119 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 44 ns max: 78 ns min: 29 ns +async::queue::bulk(16) (time/op) avg: 34 ns max: 109 ns min: 28 ns + async::queue (time/op) avg: 90 ns max: 122 ns min: 44 ns +boost::lockfree::queue (time/op) avg: 213 ns max: 227 ns min: 161 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 53 ns max: 82 ns min: 27 ns +async::queue::bulk(16) (time/op) avg: 34 ns max: 107 ns min: 29 ns + async::queue (time/op) avg: 100 ns max: 114 ns min: 51 ns +boost::lockfree::queue (time/op) avg: 197 ns max: 207 ns min: 186 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 31 ns max: 81 ns min: 25 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 104 ns min: 28 ns + async::queue (time/op) avg: 93 ns max: 117 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 211 ns max: 222 ns min: 162 ns + +Benchmark Test Run: 5 
Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 52 ns max: 79 ns min: 30 ns +async::queue::bulk(16) (time/op) avg: 33 ns max: 103 ns min: 29 ns + async::queue (time/op) avg: 94 ns max: 126 ns min: 74 ns +boost::lockfree::queue (time/op) avg: 199 ns max: 217 ns min: 174 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 49 ns max: 81 ns min: 35 ns +async::queue::bulk(16) (time/op) avg: 33 ns max: 60 ns min: 28 ns + async::queue (time/op) avg: 97 ns max: 134 ns min: 51 ns +boost::lockfree::queue (time/op) avg: 185 ns max: 198 ns min: 152 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 36 ns max: 81 ns min: 34 ns +async::queue::bulk(16) (time/op) avg: 30 ns max: 60 ns min: 26 ns + async::queue (time/op) avg: 48 ns max: 89 ns min: 45 ns +boost::lockfree::queue (time/op) avg: 161 ns max: 179 ns min: 120 ns +``` + +e.g. 
MacOS 10.12.5 Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 +``` +SSingle Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 12 ns max: 37 ns min: 12 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 54 ns min: 25 ns + async::queue (time/op) avg: 23 ns max: 61 ns min: 23 ns +boost::lockfree::queue (time/op) avg: 156 ns max: 172 ns min: 118 ns +boost::lockfree::spsc_queue (time/op) avg: 11 ns max: 30 ns min: 5 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 84 ns max: 98 ns min: 60 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 125 ns min: 24 ns + async::queue (time/op) avg: 104 ns max: 115 ns min: 92 ns +boost::lockfree::queue (time/op) avg: 231 ns max: 326 ns min: 213 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 82 ns max: 100 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 36 ns max: 108 ns min: 31 ns + async::queue (time/op) avg: 102 ns max: 122 ns min: 90 ns +boost::lockfree::queue (time/op) avg: 192 ns max: 229 ns min: 184 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 79 ns max: 93 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 94 ns min: 29 ns + async::queue (time/op) avg: 98 ns max: 116 ns min: 70 ns +boost::lockfree::queue (time/op) avg: 189 ns max: 198 ns min: 175 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 77 ns max: 146 ns min: 56 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 92 ns min: 26 ns + async::queue (time/op) avg: 93 ns max: 167 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 200 ns max: 218 ns min: 182 ns + +Benchmark Test Run: 5 Producers 3 Consumers with 
10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 76 ns max: 92 ns min: 48 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 89 ns min: 24 ns + async::queue (time/op) avg: 97 ns max: 140 ns min: 83 ns +boost::lockfree::queue (time/op) avg: 200 ns max: 211 ns min: 163 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 80 ns max: 98 ns min: 59 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 97 ns min: 24 ns + async::queue (time/op) avg: 105 ns max: 122 ns min: 78 ns +boost::lockfree::queue (time/op) avg: 182 ns max: 194 ns min: 153 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 86 ns max: 103 ns min: 64 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 82 ns min: 23 ns + async::queue (time/op) avg: 107 ns max: 127 ns min: 91 ns +boost::lockfree::queue (time/op) avg: 154 ns max: 180 ns min: 146 ns +``` + +e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0 +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 12 ns max: 71 ns min: 11 ns +async::queue::bulk(16) (time/op) avg: 65 ns max: 134 ns min: 24 ns + async::queue (time/op) avg: 48 ns max: 107 ns min: 33 ns +boost::lockfree::queue (time/op) avg: 179 ns max: 198 ns min: 60 ns +boost::lockfree::spsc_queue (time/op) avg: 7 ns max: 47 ns min: 4 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 68 ns max: 505 ns min: 35 ns +async::queue::bulk(16) (time/op) avg: 29 ns max: 135 ns min: 25 ns + async::queue (time/op) avg: 93 ns max: 138 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 234 ns max: 292 ns min: 208 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + 
async::bounded_queue (time/op) avg: 68 ns max: 106 ns min: 39 ns +async::queue::bulk(16) (time/op) avg: 35 ns max: 117 ns min: 19 ns + async::queue (time/op) avg: 92 ns max: 135 ns min: 79 ns +boost::lockfree::queue (time/op) avg: 193 ns max: 227 ns min: 175 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 73 ns max: 251 ns min: 49 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 110 ns min: 26 ns + async::queue (time/op) avg: 96 ns max: 178 ns min: 70 ns +boost::lockfree::queue (time/op) avg: 179 ns max: 359 ns min: 164 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 81 ns max: 220 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 114 ns min: 23 ns + async::queue (time/op) avg: 102 ns max: 159 ns min: 74 ns +boost::lockfree::queue (time/op) avg: 177 ns max: 541 ns min: 162 ns + +Benchmark Test Run: 5 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 83 ns max: 443 ns min: 53 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 297 ns min: 23 ns + async::queue (time/op) avg: 110 ns max: 512 ns min: 79 ns +boost::lockfree::queue (time/op) avg: 176 ns max: 505 ns min: 161 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 83 ns max: 437 ns min: 36 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 261 ns min: 23 ns + async::queue (time/op) avg: 112 ns max: 449 ns min: 84 ns +boost::lockfree::queue (time/op) avg: 178 ns max: 547 ns min: 164 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 90 ns max: 805 ns min: 28 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 78 ns min: 21 ns + async::queue (time/op) avg: 123 ns max: 695 ns min: 80 ns +boost::lockfree::queue (time/op) avg: 195 ns max: 615 ns min: 154 ns 
+``` + +e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+ +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 67 ns max: 697 ns min: 53 ns +async::queue::bulk(16) (time/op) avg: 144 ns max: 434 ns min: 130 ns + async::queue (time/op) avg: 141 ns max: 441 ns min: 115 ns +boost::lockfree::queue (time/op) avg: 182 ns max: 514 ns min: 168 ns +boost::lockfree::spsc_queue (time/op) avg: 62 ns max: 430 ns min: 53 ns + +Benchmark Test Run: 1 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 72 ns max: 574 ns min: 59 ns +async::queue::bulk(16) (time/op) avg: 141 ns max: 515 ns min: 116 ns + async::queue (time/op) avg: 181 ns max: 590 ns min: 134 ns +boost::lockfree::queue (time/op) avg: 192 ns max: 1045 ns min: 172 ns + +Benchmark Test Run: 2 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 82 ns max: 457 ns min: 65 ns +async::queue::bulk(16) (time/op) avg: 99 ns max: 701 ns min: 84 ns + async::queue (time/op) avg: 124 ns max: 550 ns min: 108 ns +boost::lockfree::queue (time/op) avg: 151 ns max: 847 ns min: 138 ns + +Benchmark Test Run: 3 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 88 ns max: 538 ns min: 67 ns +async::queue::bulk(16) (time/op) avg: 89 ns max: 717 ns min: 71 ns + async::queue (time/op) avg: 131 ns max: 631 ns min: 118 ns +boost::lockfree::queue (time/op) avg: 165 ns max: 644 ns min: 149 ns +``` + +e.g. 
Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) gcc 6.3.0 on Raspberry Pi 3 B+ +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 227 ns max: 912 ns min: 179 ns +async::queue::bulk(16) (time/op) avg: 442 ns max: 1236 ns min: 365 ns + async::queue (time/op) avg: 423 ns max: 1249 ns min: 364 ns +boost::lockfree::queue (time/op) avg: 474 ns max: 1017 ns min: 410 ns +boost::lockfree::spsc_queue (time/op) avg: 70 ns max: 761 ns min: 48 ns + +Benchmark Test Run: 1 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 241 ns max: 1482 ns min: 187 ns +async::queue::bulk(16) (time/op) avg: 470 ns max: 1259 ns min: 354 ns + async::queue (time/op) avg: 488 ns max: 1482 ns min: 375 ns +boost::lockfree::queue (time/op) avg: 462 ns max: 1158 ns min: 427 ns + + +Benchmark Test Run: 2 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 208 ns max: 348 ns min: 158 ns +async::queue::bulk(16) (time/op) avg: 285 ns max: 543 ns min: 237 ns + async::queue (time/op) avg: 306 ns max: 761 ns min: 234 ns +boost::lockfree::queue (time/op) avg: 334 ns max: 1481 ns min: 261 ns + + +Benchmark Test Run: 3 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 241 ns max: 884 ns min: 192 ns +async::queue::bulk(16) (time/op) avg: 210 ns max: 651 ns min: 180 ns + async::queue (time/op) avg: 439 ns max: 682 ns min: 375 ns +boost::lockfree::queue (time/op) avg: 420 ns max: 903 ns min: 320 ns +``` + +## coding style +all code has been formated by clang-format. It may be more easy to read in text editor or may be not :) + +## Many Thanks to 3rd party and their developers +* [Boost](http://www.boost.org/) +* [Boost CMake](https://github.com/Orphis/boost-cmake) Easy Boost integration in CMake projects! 
+* [Catch](https://github.com/philsquared/Catch) A powerful test framework for unit test. +* [cpprestsdk](https://github.com/Microsoft/cpprestsdk) The C++ REST SDK is a Microsoft project for cloud-based client-server communication in native code using a modern asynchronous C++ API design. +* [rlutil](https://github.com/tapio/rlutil) provides cross-platform console-mode functions to position and colorize text. +* [sakaki](https://github.com/sakaki-/gentoo-on-rpi3-64bit) Bootable 64-bit Gentoo image for the Raspberry Pi 3 B / B+, with Linux 4.14 diff --git a/src/3rdparty/async/bounded_queue.h b/src/3rdparty/async/bounded_queue.h new file mode 100644 index 000000000..341e5f307 --- /dev/null +++ b/src/3rdparty/async/bounded_queue.h @@ -0,0 +1,342 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// + +#pragma once + +#include "utility.h" +#include +#include +#include + +namespace async { + +struct bounded_traits { + static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag + static constexpr std::size_t CachelineSize = 64; + static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1 + using sequence_type = std::uint64_t; +}; + +template class bounded_queue { +private: + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + +public: + static constexpr std::size_t cacheline_size = TRAITS::CachelineSize; + static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment; + using seq_t = typename TRAITS::sequence_type; + explicit bounded_queue(std::size_t size) + : fastmodulo((size > 0 && ((size & (size - 1)) == 0))), + bitshift(fastmodulo ? 
getShiftBitsCount(size) : 0), + elements(new element[size]), mask(fastmodulo ? size - 1 : 0), + qsize(size), enqueueIx(0), dequeueIx(0) { + assert(qsize > 0); // any size <= 0 is illegal + } + bounded_queue(bounded_queue const &) = delete; + bounded_queue(bounded_queue &&) = delete; + bounded_queue &operator=(bounded_queue const &) = delete; + bounded_queue &operator=(bounded_queue &&) = delete; + ~bounded_queue() { delete[] elements; } + std::size_t size() { return qsize; } + + template ::value>::type> + inline void blocking_enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(enqidx)]; + auto enq_tkt = ticket(enqidx); + while (enq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + ele.construct(std::forward(args)...); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + } + + template ::value>::type> + inline bool blocking_enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(enqidx)]; + auto enq_tkt = ticket(enqidx); + while (enq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + if (ele.construct(std::forward(args)...)) { + ele.hasdata.store(true, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.hasdata.store(false, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return false; + } + } + + template ::value, + int>::type = 0> + inline bool enqueue(Args &&... 
args) noexcept { + auto enqidx = enqueueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(enqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t enq_tkt = ticket(enqidx); + seq_t diff = tkt - enq_tkt; + if (diff == 0) { + if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, + std::memory_order_release, + std::memory_order_relaxed)) { + ele.construct(std::forward(args)...); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is full + else + enqidx = enqueueIx.load(std::memory_order_acquire); + } + } + + template ::value, + int>::type = 0> + inline bool enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + for (;;) { + auto &ele = elements[index(enqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t enq_tkt = ticket(enqidx); + seq_t diff = tkt - enq_tkt; + if (diff == 0) { + if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, + std::memory_order_release, + std::memory_order_relaxed)) { + if (ele.construct(std::forward(args)...)) { + ele.hasdata.store(true, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.hasdata.store(false, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return false; + } + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is full + else + enqidx = enqueueIx.load(std::memory_order_acquire); + } + } + + template ::value>::type> + inline void blocking_dequeue(U &data) noexcept { + auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(deqidx)]; + seq_t deq_tkt = ticket(deqidx) + 1; + while (deq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + } + + template ::value>::type> + 
inline bool blocking_dequeue(U &data) noexcept { + auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(deqidx)]; + seq_t deq_tkt = ticket(deqidx) + 1; + while (deq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + if (ele.hasdata.load(std::memory_order_acquire)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return false; + } + } + + template ::value, + int>::type = 0> + inline bool dequeue(U &data) { + + auto deqidx = dequeueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(deqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t deq_tkt = ticket(deqidx) + 1; + seq_t diff = tkt - deq_tkt; + if (diff == 0) { + if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is empty + else { + + deqidx = dequeueIx.load(std::memory_order_acquire); + } + } + } + + template < + typename U = T, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline bool + dequeue(U &data) // false could be queue is empty, or skip an invalid element + { + + auto deqidx = dequeueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(deqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t deq_tkt = ticket(deqidx) + 1; + seq_t diff = tkt - deq_tkt; + if (diff == 0) { + if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + if (ele.hasdata.load(std::memory_order_acquire)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return 
false; + } + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is empty + else { + deqidx = dequeueIx.load(std::memory_order_acquire); + } + } + } + +private: + inline seq_t index(seq_t const seq) { + if (fastmodulo) + return seq & mask; + else + return seq >= qsize ? seq % qsize : seq; + } + + inline seq_t ticket(seq_t const seq) { + if (fastmodulo) + return (seq >> bitshift) << 1; + else + return (seq / static_cast(qsize)) << 1; + } + //TODO& Review: replace the following with c++ concepts + template struct checkdata {}; + + template + struct checkdata::value>::type> {}; + + template + struct checkdata::value>::type> { + checkdata() : hasdata(false) {} + std::atomic hasdata; + }; + + struct element : public checkdata { + element() : tkt(0) {} + ~element() { + if (tkt & 1) // enqueue op visited + destruct(); + } + + template ::value>::type> + inline void construct(Args &&... args) noexcept { + new (&storage) T(std::forward(args)...); + } + + template ::value>::type> + inline bool construct(Args &&... args) noexcept { + try { + new (&storage) T(std::forward(args)...); + } catch (...) { + return false; + } + return true; + } + + inline void destruct() noexcept { reinterpret_cast(&storage)->~T(); } + + inline T *getptr() { return reinterpret_cast(&storage); } + + template < + typename U = T, // NON-SAFE + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + data = std::move(*getptr()); + destruct(); + } + + template < + typename U = T, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + try { + data = std::move(*getptr()); + } catch (...) 
{ + } + destruct(); + } + + std::atomic tkt; + typename std::aligned_storage::type storage; + std::atomic hasdata; + }; + + bool const fastmodulo; // true if qsize is power of 2 + int const bitshift; // used if fastmodulo is true + element *const elements; // pointer to buffer + std::size_t const mask; // used if fastmodulo is true + std::size_t const qsize; // queue size + alignas(cacheline_alignment) char cacheline_padding1[cacheline_size]; + alignas(cacheline_alignment) std::atomic enqueueIx; + alignas(cacheline_alignment) char cacheline_padding2[cacheline_size]; + alignas(cacheline_alignment) std::atomic dequeueIx; + alignas(cacheline_alignment) char cacheline_padding3[cacheline_size]; +}; +} // namespace async diff --git a/src/3rdparty/async/queue.h b/src/3rdparty/async/queue.h new file mode 100644 index 000000000..6b00d1d61 --- /dev/null +++ b/src/3rdparty/async/queue.h @@ -0,0 +1,429 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once +#include "utility.h" +#include +#include +#include + +namespace async { +struct traits // 3-level (L3, L2, L1) depth of nested group design, total + // indexing space is pow(2, 64-Tagbits) +{ // user can change the bits settings by providing your own TRAITS + static constexpr std::uint64_t Tagbits = 24; + static constexpr std::uint64_t L3bits = 10; + static constexpr std::uint64_t L2bits = 10; + static constexpr std::uint64_t L1bits = 12; + static constexpr std::uint64_t Basebits = 8; + static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag + static constexpr std::size_t CachelineSize = 64; + static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1 +}; + +template class queue final { +public: + static bool is_lock_free_v() { + return std::atomic{}.is_lock_free(); + } + static constexpr std::size_t cacheline_size = TRAITS::CachelineSize; + static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment; + static constexpr std::uint64_t BaseMask = getBitmask(TRAITS::Basebits); + static constexpr std::uint64_t L1Mask = getBitmask(TRAITS::L1bits) + << TRAITS::Basebits; + static constexpr std::uint64_t L2Mask = getBitmask(TRAITS::L2bits) + << (TRAITS::Basebits + TRAITS::L1bits); + static constexpr std::uint64_t L3Mask = + getBitmask(TRAITS::L3bits) + << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits); + static constexpr std::uint64_t TagMask = + getBitmask(TRAITS::Tagbits) + << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits + TRAITS::L3bits); + static constexpr std::uint64_t TagShift = 64 - TRAITS::Tagbits; + static constexpr std::uint64_t TagPlus1 = static_cast(1) << TagShift; + +public: // assert bits settings meet requirements + static_assert(TRAITS::Tagbits + TRAITS::L3bits + 
TRAITS::L2bits + + TRAITS::L1bits + TRAITS::Basebits == + 64, + "The sum of all bits settings should be 64"); + static_assert(TRAITS::Tagbits > 0 && TRAITS::L3bits > 0 && + TRAITS::L2bits > 0 && TRAITS::L1bits > 0 && + TRAITS::Basebits > 3, + "All bits settings should be > 0 and Basebits must be > 3"); + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + +public: + queue() : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) { + container.get(index(0)); // allocate initial space + } + queue(std::size_t size) // pre-allocate size + : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) { + container.get(index(0)); + + if (size > (static_cast(1) << TRAITS::Basebits)) { + index ix; + for (std::size_t i = (static_cast(1) << TRAITS::Basebits); i < size; + ++i) { + auto &node = getNode(ix); + recycle(ix); + } + } + } + + queue(queue const &other) = delete; + queue &operator=(queue const &other) = delete; + queue(queue &&other) = delete; + queue &operator=(queue &&other) = delete; + + template ::value>::type> + inline void enqueue(Args &&... args) noexcept { + auto ix = encapsulate(std::forward(args)...); + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(ix, std::memory_order_release); + } + + template ::value>::type> + inline bool enqueue(Args &&... 
args) noexcept { + auto ix = encapsulate(std::forward(args)...); + if (ix == 0) + return false; + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(ix, std::memory_order_release); + return true; + } + + template void bulk_enqueue(IT it, std::size_t count) { + index firstidx(0), preidx(0), lastidx(0); + for (std::size_t i = 0; i < count; ++i) { + lastidx = encapsulate(*it++); + if (firstidx == 0) + firstidx = lastidx; + if (preidx != 0) { + container[preidx].next.store(lastidx, std::memory_order_relaxed); + } + preidx = lastidx; + } + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, lastidx, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(firstidx, std::memory_order_release); + } + + template + std::size_t bulk_dequeue(IT &&it, std::size_t maxcount) // or IT& it to return the + { + std::size_t count(0); + while (maxcount-- && dequeue(*it++)) { + ++count; + } + return count; + } + + template // U could be T, or any kinds of iterators/adapters, + // like insert_iterator + inline bool dequeue(U &data) noexcept // return false if queue is empty + { + for (;;) { + auto deqidx = dequeueIx.load(std::memory_order_acquire); + auto &node = container[deqidx]; + auto next = node.next.load(std::memory_order_relaxed); + if (next == 0) { + auto ready_for_consume = + node.consume_ready.load(std::memory_order_relaxed); + if (!ready_for_consume) { + return false; + } + + if (node.consume_ready.compare_exchange_strong( + ready_for_consume, false, std::memory_order_release, + std::memory_order_relaxed)) { + node.template move(data); + return true; + } + } else { + if (dequeueIx.compare_exchange_weak(deqidx, next, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + auto ready_for_consume = + 
node.consume_ready.load(std::memory_order_acquire); + if (ready_for_consume && + node.consume_ready.compare_exchange_strong( + ready_for_consume, false, std::memory_order_release, + std::memory_order_relaxed)) { + node.template move(data); + } else { // the node is being consumed by another thread, waiting for + // it finishes + for (; !node.recycle_ready.load(std::memory_order_acquire);) { + } + } + node.next.store( + 0, std::memory_order_relaxed); // reset link to avoid chain effect + recycle(deqidx); + if (ready_for_consume) + return ready_for_consume; + } + } + } + } + std::uint64_t getNodeCount() { return nodeCount; } // get in-use-nodes count + +private: // internal data structures + struct index // simulate tagged pointer + { + index(std::uint64_t newval) noexcept + : value(newval) {} // is_trivially_copyable must be true + index() noexcept : value(0) {} + inline operator std::uint64_t() const { return value; } + std::uint64_t getVersion() { return (value & TagMask) >> TagShift; } + inline void increTag() { + value = (value & ~TagMask) | ((value + TagPlus1) & TagMask); + } + std::uint64_t value; + }; + + struct node // to store the data + { + node() : next(0), consume_ready(false), recycle_ready(true) {} + ~node() noexcept { + if (consume_ready.load(std::memory_order_relaxed)) { + destruct(); + } + } + + template ::value>::type> + inline void construct(Args &&... args) noexcept { + new (&storage) T(std::forward(args)...); + consume_ready.store(true, std::memory_order_release); + recycle_ready.store(false, std::memory_order_release); + } + + template ::value>::type> + inline bool construct(Args &&... args) noexcept { + try { + new (&storage) T(std::forward(args)...); + } catch (...) 
{ + return false; + } + + consume_ready.store(true, std::memory_order_release); + recycle_ready.store(false, std::memory_order_release); + return true; + } + + inline void destruct() noexcept { reinterpret_cast(&storage)->~T(); } + + template < + typename TR, typename U, // NON-SAFE + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + data = std::move(*getptr()); + destruct(); + recycle_ready.store(true, std::memory_order_release); + } + + template < + typename TR, typename U, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + try { + data = std::move(*getptr()); + } catch (...) { + } + destruct(); + recycle_ready.store(true, std::memory_order_release); + } + inline T *getptr() { return reinterpret_cast(&storage); } + std::atomic next; // link + std::atomic consume_ready; // if true, consume ready + std::atomic recycle_ready; // if true, recycle ready + typename std::aligned_storage::type storage; // data + }; + + struct basecontainer { + inline node &get(index const &ix) { return operator[](ix); } + inline node &at(index const &ix) { return operator[](ix); } + inline node &operator[](index const &ix) { return nodes[ix & BaseMask]; } + std::array(1) << TRAITS::Basebits> nodes; + }; + + template struct nestedcontainer { + static constexpr std::uint64_t mask = BitMask; + static constexpr std::uint64_t bits = getSetBitsCount(mask); + static constexpr std::uint64_t shift = getShiftBitsCount(mask); + std::array, static_cast(1) << bits> + subgroups; + nestedcontainer() { + for (auto &gptr : subgroups) { + gptr.store(nullptr, std::memory_order_release); + } + } + ~nestedcontainer() { + for (auto &gptr : subgroups) { + if (gptr.load(std::memory_order_relaxed) != nullptr) + delete gptr.load(std::memory_order_relaxed); + } + } + + inline node &get(index const &ix) // will trigger the new operation if + // subgroup doesn't exist + { + auto ptr = + subgroups[(ix & mask) >> 
shift].load(std::memory_order_acquire); + if (ptr == nullptr) { + auto newgroup = std::make_unique(); // if ComExch fails, + // unique_ptr will self + // delete + if (subgroups[(ix & mask) >> shift].compare_exchange_strong( + ptr, newgroup.get(), std::memory_order_release, + std::memory_order_acquire)) { + ptr = newgroup.release(); + } + } + return ptr->get(ix); // recursively calling get 'til get the node + } + + inline node &operator[](index const &ix) { + return subgroups[(ix & mask) >> shift] + .load(std::memory_order_relaxed) + -> + operator[](ix); + } + + inline node &at(index const &ix) { // balanced performance and safety + auto ptr = + subgroups[(ix & mask) >> shift].load(std::memory_order_relaxed); + if (ptr) + return ptr->at(ix); + else + return get(ix); + } + }; + + inline node &getNode(index &ix) { // return an existing or new node + #if defined(__arm__) && (!defined(__aarch64__)) + //for ARMV7 or below + ix.value = nodeCount.load(std::memory_order_relaxed); + auto val = ix.value + 1; + while(!nodeCount.compare_exchange_weak( + ix.value, val, std::memory_order_release, std::memory_order_relaxed)) { + val = ix.value + 1; + } + #else + ix.value = nodeCount.fetch_add(static_cast(1), + std::memory_order_relaxed); + #endif + if ((ix.value & BaseMask) == 0) + return container.get(ix); + else + return container.at(ix); + } + + template ::value, + int>::type = 0> + inline index encapsulate(Args &&... args) noexcept { + auto ix = spawn(); + auto &node = container[ix]; + node.construct(std::forward(args)...); + node.next.store(0, std::memory_order_relaxed); + return ix; + } + + template ::value, + int>::type = 0> + inline index encapsulate(Args &&... 
args) noexcept { + auto ix = spawn(); + auto &node = container[ix]; + node.next.store(0, std::memory_order_relaxed); + if (node.construct(std::forward(args)...)) + return ix; + else { + recycle(ix); // construction failed, recycle the node + return index(0); + } + } + + inline void recycle(index const &ix) { + auto recycle = recycleIx.load(std::memory_order_relaxed); + while (!recycleIx.compare_exchange_weak( + recycle, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[recycle].next.store(ix, std::memory_order_release); + } + + inline auto spawn() +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER < 1800) + -> index +#endif + { + index ix(0); + for (;;) { + auto spaidx = spawnIx.load(std::memory_order_acquire); + auto next = container[spaidx].next.load(std::memory_order_relaxed); + if (next == 0) { + getNode(ix); + return ix; + } else { + if (spawnIx.compare_exchange_weak(spaidx, next, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + if (spaidx != 0) { + spaidx.increTag(); + } + return spaidx; + } + } + } + } + + using L1container = nestedcontainer; + using L2container = nestedcontainer; + nestedcontainer container; + alignas(cacheline_alignment) char cacheline_padding1[cacheline_size]; + alignas(cacheline_alignment) std::atomic nodeCount; // # of allocated nodes, not the # + // of elements stored in the queue + alignas(cacheline_alignment) char cacheline_padding2[cacheline_size]; + alignas(cacheline_alignment) std::atomic dequeueIx; // dequeue pointer + alignas(cacheline_alignment) char cacheline_padding3[cacheline_size]; + alignas(cacheline_alignment) std::atomic enqueueIx; // enqueue pointer + alignas(cacheline_alignment) char cacheline_padding4[cacheline_size]; + alignas(cacheline_alignment) std::atomic spawnIx; // spawn pointer + alignas(cacheline_alignment) char cacheline_padding5[cacheline_size]; + alignas(cacheline_alignment) std::atomic recycleIx; 
// recycle pointer + alignas(cacheline_alignment) char cacheline_padding6[cacheline_size]; +}; +} // namespace async diff --git a/src/3rdparty/async/threadpool.h b/src/3rdparty/async/threadpool.h new file mode 100644 index 000000000..395a9d850 --- /dev/null +++ b/src/3rdparty/async/threadpool.h @@ -0,0 +1,192 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once +#include "queue.h" +#include +#include +#include +#include +#include +#include +#include +#include +namespace async { +// thread pool to execute functions, functors, lamdas asynchronously, +// default poolsize = machine's logical CPU cores/threads +class threadpool final { +public: + static int defaultpoolsize() { return std::thread::hardware_concurrency(); } + + threadpool(int poolsize = defaultpoolsize()) + : idlecount(0), conflag(false) { + configurepool(poolsize); + } + + threadpool(const threadpool &) = delete; + threadpool(threadpool &&) = delete; + threadpool &operator=(const threadpool &) = delete; + threadpool &operator=(threadpool &&) = delete; + + ~threadpool() { cleanup(); } + + inline std::size_t size() { + std::lock_guard lg(poolmux); + return threads.size(); + } + + inline int idlesize() { return idlecount; } + + // can be called to resize the pool at any time after construction and before + // destruction, recommand to be called from main thread or manager thread even + // though it is thread-safe + void configurepool(std::size_t poolsize) { + std::unique_lock veclk(poolmux); + auto currentsize = threads.size(); + if (currentsize < poolsize) { // expand the pool + for (std::size_t i = currentsize; i < poolsize; i++) { + tpstops.emplace_back(addthread()); + } + } else if (currentsize > poolsize) { // 
shrink the pool + std::vector> dumpthreads; + std::vector *> dumpthreadstops; + std::move(threads.begin() + poolsize, threads.end(), + std::back_inserter(dumpthreads)); + std::move(tpstops.begin() + poolsize, tpstops.end(), + std::back_inserter(dumpthreadstops)); + tpstops.resize(poolsize); + threads.resize(poolsize); + veclk.unlock(); + for (auto &a : dumpthreadstops) { + *a = true; + } + for (auto &t : dumpthreads) { + t->detach(); + } + { + std::unique_lock lk(qcvmux); // suspended threads to quit + qcv.notify_all(); + } + } + } + + template + inline auto post(Func &&func, Args &&... args) +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER <= 1800) + -> std::future::type> +#endif + { // TODO: replace result_of with invoke_result_t when migrate to c++17 + auto taskptr = std::make_shared< + std::packaged_task::type()>>( + std::bind(std::forward(func), std::forward(args)...)); + taskqueue.enqueue([taskptr]() { (*taskptr)(); }); + { + std::lock_guard lg(qcvmux); + conflag = true; + } + qcv.notify_one(); + return taskptr->get_future(); + } + + template + inline auto post(Func &&func) +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER <= 1800) + -> std::future::type> +#endif + { // a special case for func() type without any parameters, might be + // removed later + auto taskptr = std::make_shared< + std::packaged_task::type()>>( + std::forward(func)); + taskqueue.enqueue([taskptr]() { (*taskptr)(); }); + { + std::lock_guard lg(qcvmux); + conflag = true; + } + qcv.notify_one(); + return taskptr->get_future(); + } + +private: + struct executor { + executor(std::unique_ptr> &&ptr, threadpool &pool) + : stop(std::move(ptr)), thpool(pool) {} + void operator()() { + while (!*stop) { + if (!thpool.executetask_in_loop(*stop)) { + return; // signaled to quit + } + thpool.wait_for_task(*stop); // wait for new task + } + } + + private: + std::unique_ptr> 
stop; + threadpool &thpool; + }; + + std::atomic *addthread() { + auto stopuniptr = std::make_unique>(false); + auto stoprawptr = stopuniptr.get(); + threads.emplace_back( + std::make_unique(executor(std::move(stopuniptr), *this))); + return stoprawptr; + } + + void cleanup() { // make sure no more tasks being pushed to the taskqueue + { + std::lock_guard lk(qcvmux); + qcv.notify_all(); // let running thread drain the task queue? no need, + // should be removed + } + for (auto &stop : tpstops) { + *stop = true; // stop signaled + } + { + std::lock_guard lk(qcvmux); + qcv.notify_all(); // notify again + } + for (auto &thread : threads) { + if (thread->joinable()) + thread->join(); + } + threads.clear(); + tpstops.clear(); + } + + inline void wait_for_task(std::atomic const &stop) { + idlecount.fetch_add(1, std::memory_order_relaxed); + { + std::unique_lock lk(qcvmux); + qcv.wait(lk, [&]() { + return conflag || stop.load(std::memory_order_acquire); + }); //memory_oder can be removed + conflag = false; + } + idlecount.fetch_sub(1, std::memory_order_relaxed); + } + + inline bool executetask_in_loop(std::atomic const &stop) { + std::function func; + for (; taskqueue.dequeue(func);) { + func(); + if (stop) // stop is signaled + return false; + } + return true; + } + + std::vector> threads; + std::vector *> tpstops; // threads terminate flags + async::queue> taskqueue; + std::atomic idlecount; // idle thread count + std::mutex qcvmux, poolmux; + std::condition_variable qcv; + bool conflag; // continue flag for cv +}; +} // namespace async diff --git a/src/3rdparty/async/utility.h b/src/3rdparty/async/utility.h new file mode 100644 index 000000000..f5bb2d1f4 --- /dev/null +++ b/src/3rdparty/async/utility.h @@ -0,0 +1,66 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once + +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus < 201103L) || \ + (defined(_MSC_VER) && _MSC_VER < 1800) +#error This library needs at least a C++11 compliant compiler +#endif +#include +#include +#include +#include +template static constexpr T getBitmask(unsigned int const bits) { + return static_cast(-(bits != 0)) & + (static_cast(-1) >> ((sizeof(T) * CHAR_BIT) - bits)); +} + +#if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1910) +// c++14 impl +static constexpr unsigned int getSetBitsCount(std::uint64_t n) { + unsigned int count{0}; + while (n) { + n &= (n - 1); + count++; + } + return count; +} + +static constexpr unsigned int getShiftBitsCount(std::uint64_t n) { + // requires c++14 + unsigned int count{0}; + if (n == 0) + return count; + while ((n & 0x1) == 0) { + n >>= 1; + ++count; + } + return count; +} + +#elif __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800) +// c++11 impl +static constexpr unsigned int getSetBitsCount(std::uint64_t n) { + return n == 0 ? 0 : 1 + getSetBitsCount(n & (n - 1)); +} + +static constexpr unsigned int getShiftBitsCount(std::uint64_t n) { + return n == 0 ? 0 : ((n & 0x1) == 0 ? 1 + getShiftBitsCount(n >> 1) : 0); +} + +#if (__cplusplus == 201103L) && (defined(__clang__) || defined(__GNUC__)) +namespace std { // for c+11 +template +std::unique_ptr make_unique(Args &&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} +} // namespace std +#endif + +#else +#error This library needs at least a C++11 compliant compiler +#endif diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h index 520820c02..e25260909 100644 --- a/src/TNL/Containers/ByteArraySynchronizer.h +++ b/src/TNL/Containers/ByteArraySynchronizer.h @@ -12,7 +12,13 @@ #pragma once +#include +// 3rd-party async library providing a thread-pool +#include + #include +#include +#include namespace TNL { namespace Containers { @@ -20,12 +26,121 @@ namespace Containers { template< typename Device, typename Index > class ByteArraySynchronizer { +private: + // NOTE: async::threadpool has alignment requirements, which causes problems: + // - it may become misaligned in derived classes, see e.g. + // https://stackoverflow.com/a/46475498 + // solution: specify it as the first member of the base class + // - operator new before C++17 may not support over-aligned types, see + // https://stackoverflow.com/a/53485295 + // solution: relaxed alignment requirements to not exceed the value of + // alignof(std::max_align_t), which is the strongest alignment supported + // by plain new. See https://github.com/d36u9/async/pull/2 + async::threadpool tp; + + int gpu_id = 0; + public: using ByteArrayView = ArrayView< std::uint8_t, Device, Index >; + using RequestsVector = std::vector< typename Communicators::MpiCommunicator::Request >; + + enum class AsyncPolicy { + synchronous, + deferred, + threadpool, + async, + }; + + ByteArraySynchronizer() : tp(1) {} + + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) = 0; + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) = 0; + + /** + * \brief An asynchronous version of \ref synchronizeByteArray. 
+ * + * Note that this method is not thread-safe - only the thread which created + * and "owns" the instance of this object can call this method. + * + * Note that at most one async operation may be active at a time, the + * following calls will block until the pending operation is finished. + */ + void synchronizeByteArrayAsync( ByteArrayView array, int bytesPerValue, AsyncPolicy policy = AsyncPolicy::synchronous ) + { + // wait for any previous synchronization (multiple objects can share the + // same synchronizer) + if( async_op.valid() ) { + async_wait_before_start_timer.start(); + async_op.wait(); + async_wait_before_start_timer.stop(); + } - virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) = 0; + async_start_timer.start(); + + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaGetDevice(&gpu_id); + #endif + + if( policy == AsyncPolicy::threadpool || policy == AsyncPolicy::async ) { + // everything offloaded to a separate thread + auto worker = [=] () { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaSetDevice(this->gpu_id); + #endif + + this->synchronizeByteArray( array, bytesPerValue ); + }; + + if( policy == AsyncPolicy::threadpool ) + async_op = tp.post( worker ); + else + async_op = std::async( std::launch::async, worker ); + } + else if( policy == AsyncPolicy::deferred ) { + // immediate start, deferred synchronization (but still in the same thread) + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + auto worker = [requests] () mutable { + Communicators::MpiCommunicator::WaitAll( requests.data(), requests.size() ); + }; + this->async_op = std::async( std::launch::deferred, worker ); + } + else { + // synchronous + synchronizeByteArray( 
array, bytesPerValue ); + } + + async_ops_count++; + async_start_timer.stop(); + } virtual ~ByteArraySynchronizer() = default; + + /** + * \brief Can be used for checking if a synchronization started + * asynchronously has been finished. + * + * Note that derived classes *must* make this check in the destructor, + * otherwise running \ref synchronizeByteArrayAsync would lead to the error + * `pure virtual method called` when the derived object is destructed before + * the async operation finishes. This cannot be implemented in the base class + * destructor, because the derived destructor is run first. + * + * ~Derived() + * { + * if( this->async_op.valid() ) + * this->async_op.wait(); + * } + */ + std::future< void > async_op; + + // attributes for profiling + Timer async_wait_before_start_timer, async_start_timer, async_wait_timer; + std::size_t async_ops_count = 0; }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h index c1571bc9e..33e96ca9a 100644 --- a/src/TNL/Containers/DistributedArray.h +++ b/src/TNL/Containers/DistributedArray.h @@ -49,6 +49,8 @@ public: using Self = DistributedArray< _Value, _Device, _Index, _Communicator >; + ~DistributedArray(); + DistributedArray() = default; // Copy-constructor does deep copy. diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp index c23d0a7e4..cd0eb49d5 100644 --- a/src/TNL/Containers/DistributedArray.hpp +++ b/src/TNL/Containers/DistributedArray.hpp @@ -20,6 +20,18 @@ namespace TNL { namespace Containers { +template< typename Value, + typename Device, + typename Index, + typename Communicator > +DistributedArray< Value, Device, Index, Communicator >:: +~DistributedArray() +{ + // Wait for pending async operation, otherwise the synchronizer would crash + // if the array goes out of scope. 
+ waitForSynchronization(); +} + template< typename Value, typename Device, typename Index, diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index bf63f8cc6..0a9aef1a4 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -51,6 +51,8 @@ public: using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >; + ~DistributedArrayView(); + // Initialization by raw data DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData) @@ -107,6 +109,8 @@ public: int getValuesPerElement() const; + // Note that this method is not thread-safe - only the thread which created + // and "owns" the instance of this object can call this method. void startSynchronization(); void waitForSynchronization() const; diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index cb95427fc..65654a54d 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -17,6 +17,20 @@ namespace TNL { namespace Containers { +template< typename Value, + typename Device, + typename Index, + typename Communicator > +DistributedArrayView< Value, Device, Index, Communicator >:: +~DistributedArrayView() +{ + // Wait for pending async operation, otherwise the synchronizer might crash + // if the view goes out of scope. + // (The same thing is done even in DistributedArray, but there might be views + // bound to an array without a synchronizer, in which case this helps.) + waitForSynchronization(); +} + template< typename Value, typename Device, typename Index, @@ -234,14 +248,9 @@ startSynchronization() // like linear solvers...) 
TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" ); - // wait for any previous synchronization (in case the array was inconsistently modified - // while a synchronization was in progress) - waitForSynchronization(); - typename SynchronizerType::ByteArrayView bytes; bytes.bind( reinterpret_cast( localData.getData() ), sizeof(ValueType) * localData.getSize() ); - // TODO: implement the async stuff - synchronizer->synchronizeByteArray( bytes, sizeof(ValueType) * valuesPerElement ); + synchronizer->synchronizeByteArrayAsync( bytes, sizeof(ValueType) * valuesPerElement ); } template< typename Value, @@ -252,7 +261,11 @@ void DistributedArrayView< Value, Device, Index, Communicator >:: waitForSynchronization() const { - // TODO: implement the async stuff + if( synchronizer && synchronizer->async_op.valid() ) { + synchronizer->async_wait_timer.start(); + synchronizer->async_op.wait(); + synchronizer->async_wait_timer.stop(); + } } diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h index 75e958734..32ba735e5 100644 --- a/src/TNL/Containers/Partitioner.h +++ b/src/TNL/Containers/Partitioner.h @@ -82,6 +82,14 @@ public: public: using ByteArrayView = typename Base::ByteArrayView; + using RequestsVector = typename Base::RequestsVector; + + ~ArraySynchronizer() + { + // wait for pending async operation, otherwise it would crash + if( this->async_op.valid() ) + this->async_op.wait(); + } ArraySynchronizer() = delete; @@ -89,7 +97,13 @@ public: : localRange(localRange), overlaps(overlaps), group(group) {} - virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override + { + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + Communicator::WaitAll( requests.data(), requests.size() ); + } + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override { 
TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps), "unexpected array size" ); @@ -122,8 +136,7 @@ public: bytesPerValue * overlaps, right, 0, group ) ); - // wait for all communications to finish - Communicator::WaitAll( requests.data(), requests.size() ); + return requests; } }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h index 225d1a2df..382de6905 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h @@ -42,6 +42,14 @@ public: using GlobalIndexType = typename DistributedMesh::GlobalIndexType; using CommunicatorType = typename DistributedMesh::CommunicatorType; using ByteArrayView = typename Base::ByteArrayView; + using RequestsVector = typename Base::RequestsVector; + + ~DistributedMeshSynchronizer() + { + // wait for pending async operation, otherwise it would crash + if( this->async_op.valid() ) + this->async_op.wait(); + } DistributedMeshSynchronizer() = default; @@ -52,12 +60,6 @@ public: TNL_ASSERT_EQ( mesh.template getGlobalIndices< EntityDimension >().getSize(), mesh.getLocalMesh().template getEntitiesCount< EntityDimension >(), "Global indices are not allocated properly." 
); - // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ - #ifdef HAVE_CUDA - if( std::is_same< DeviceType, Devices::Cuda >::value ) - cudaGetDevice(&this->gpu_id); - #endif - group = mesh.getCommunicationGroup(); const int rank = CommunicatorType::GetRank( group ); const int nproc = CommunicatorType::GetSize( group ); @@ -127,7 +129,7 @@ public: // send indices of ghost entities - set them as ghost neighbors on the target rank { - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; // send our ghost indices to the neighboring ranks GlobalIndexType ghostOffset = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >(); @@ -196,17 +198,17 @@ public: synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement ); } - virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override + { + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + CommunicatorType::WaitAll( requests.data(), requests.size() ); + } + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override { TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ], "The array does not have the expected size." 
); - // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ - #ifdef HAVE_CUDA - if( std::is_same< DeviceType, Devices::Cuda >::value ) - cudaSetDevice(gpu_id); - #endif - const int rank = CommunicatorType::GetRank( group ); const int nproc = CommunicatorType::GetSize( group ); @@ -214,7 +216,7 @@ public: sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; // issue all receive async operations for( int j = 0; j < nproc; j++ ) { @@ -250,8 +252,7 @@ public: } } - // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + return requests; } // performs a synchronization of a sparse matrix @@ -271,7 +272,7 @@ public: const int nproc = CommunicatorType::GetSize( group ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; Containers::Array< GlobalIndexType, Devices::Host, int > send_rankOffsets( nproc + 1 ), recv_rankOffsets( nproc + 1 ); Containers::Array< GlobalIndexType, Devices::Host, GlobalIndexType > send_rowCapacities, send_rowPointers, send_columnIndices, recv_rowPointers, recv_columnIndices; @@ -350,7 +351,7 @@ public: // allocate row pointers recv_rowPointers.setSize( recv_rankOffsets[ nproc ] + 1 ); - std::vector< typename CommunicatorType::Request > row_lengths_requests; + RequestsVector row_lengths_requests; // set row pointers GlobalIndexType rowPtr = 0; @@ -443,9 +444,6 @@ public: } protected: - // GOTCHA (see above) - int gpu_id = 0; - // communication group taken from the distributed mesh typename CommunicatorType::CommunicationGroup group; diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index d201a0a09..f594a081b 100644 --- 
a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -104,6 +104,7 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal ) ArrayType globalArray( this->globalSize ); setLinearSequence( globalArray ); this->distributedArray.copyFromGlobal( globalArray ); + this->distributedArray.waitForSynchronization(); const auto localRange = this->distributedArray.getLocalRange(); ArrayViewType localArrayView; @@ -151,6 +152,7 @@ TYPED_TEST( DistributedArrayTest, setValue ) using ArrayType = typename TestFixture::ArrayType; this->distributedArray.setValue( 1.0 ); + this->distributedArray.waitForSynchronization(); ArrayViewType localArrayView = this->distributedArray.getLocalView(); ArrayType expected( localArrayView.getSize() ); expected.setValue( 1.0 ); @@ -163,6 +165,7 @@ TYPED_TEST( DistributedArrayTest, setValueGhosts ) using ArrayType = typename TestFixture::ArrayType; this->distributedArray.setValue( this->rank ); + this->distributedArray.waitForSynchronization(); ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts(); ArrayType expected( localArrayView.getSize() ); expected.setValue( this->rank ); @@ -184,6 +187,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) using IndexType = typename TestFixture::IndexType; this->distributedArray.setValue( 0 ); + this->distributedArray.waitForSynchronization(); ArrayViewType localArrayView = this->distributedArray.getLocalView(); const auto localRange = this->distributedArray.getLocalRange(); @@ -214,6 +218,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) } this->distributedArray.setValue( 0 ); + this->distributedArray.waitForSynchronization(); // use operator[] if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { @@ -322,6 +327,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue ) EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) ); this->distributedArray.setValue( 100 ); + 
this->distributedArray.waitForSynchronization(); EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) ); } -- GitLab From 9a88469e711e804d98aff30c1538118989ab5df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 29 Dec 2020 21:11:37 +0100 Subject: [PATCH 35/50] MPI refactoring: split MpiCommunicator into plain functions in the TNL::MPI namespace --- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 4 +- .../tnl-benchmark-linear-solvers.h | 4 +- .../ODESolvers/tnl-benchmark-ode-solvers.h | 10 +- src/TNL/Communicators/MPITypeResolver.h | 108 ------ src/TNL/Communicators/MpiCommunicator.h | 273 ++------------ src/TNL/Containers/DistributedArray.hpp | 1 - .../Expressions/DistributedComparison.h | 2 +- .../DistributedVerticalOperations.h | 2 +- src/TNL/MPI.h | 29 ++ .../MpiDefs.h => MPI/DummyDefs.h} | 17 +- .../{Communicators/MPIPrint.h => MPI/Print.h} | 55 ++- .../ScopedInitializer.h | 15 +- src/TNL/MPI/Utils.h | 46 +++ src/TNL/MPI/Wrappers.h | 347 ++++++++++++++++++ src/TNL/MPI/getDataType.h | 119 ++++++ src/TNL/MPI/selectGPU.h | 72 ++++ .../DistributedMeshes/BufferEntitiesHelper.h | 1 - .../DistributedGridIO_MeshFunction.h | 71 ++-- .../DistributedGridSynchronizer.h | 1 - src/TNL/Solvers/Solver_impl.h | 6 +- src/Tools/tnl-game-of-life.cpp | 4 +- src/Tools/tnl-init.cpp | 4 +- src/Tools/tnl-test-distributed-mesh.h | 4 +- .../DistributedNDArrayOverlaps_1D_test.h | 1 - .../DistributedNDArrayOverlaps_semi1D_test.h | 1 - .../ndarray/DistributedNDArray_1D_test.h | 1 - .../ndarray/DistributedNDArray_semi1D_test.h | 1 - .../DistributedMeshes/DistributedMeshTest.h | 1 - src/UnitTests/main_mpi.h | 4 +- 29 files changed, 740 insertions(+), 464 deletions(-) delete mode 100644 src/TNL/Communicators/MPITypeResolver.h create mode 100644 src/TNL/MPI.h rename src/TNL/{Communicators/MpiDefs.h => MPI/DummyDefs.h} (64%) rename src/TNL/{Communicators/MPIPrint.h => MPI/Print.h} (75%) rename src/TNL/{Communicators => MPI}/ScopedInitializer.h (72%) create 
mode 100644 src/TNL/MPI/Utils.h create mode 100644 src/TNL/MPI/Wrappers.h create mode 100644 src/TNL/MPI/getDataType.h create mode 100644 src/TNL/MPI/selectGPU.h diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index 74a3205d3..abe08210d 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -309,7 +309,7 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 06ba2bc94..75b1e0e25 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -592,7 +592,7 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); if( ! 
parseCommandLine( argc, argv, conf_desc, parameters ) ) diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h index aa4370c7a..fcaaaedf2 100644 --- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h +++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -63,7 +63,7 @@ benchmarkODESolvers( Benchmark& benchmark, #ifdef HAVE_CUDA CudaVectorPointer cuda_u( dofs ); *cuda_u = 0.0; -#endif +#endif if( solver == "euler" || solver == "all" ) { using HostSolver = Solvers::ODE::Euler< HostProblem, SolverMonitorType >; benchmark.setOperation("Euler"); @@ -168,10 +168,10 @@ bool resolveRealTypes( Benchmark& benchmark, Config::ParameterContainer& parameters ) { const String& realType = parameters.getParameter< String >( "real-type" ); - if( ( realType == "float" || realType == "all" ) && + if( ( realType == "float" || realType == "all" ) && ! resolveIndexType< float >( benchmark, metadata, parameters ) ) return false; - if( ( realType == "double" || realType == "all" ) && + if( ( realType == "double" || realType == "all" ) && ! resolveIndexType< double >( benchmark, metadata, parameters ) ) return false; return true; @@ -225,7 +225,7 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); if( ! 
parseCommandLine( argc, argv, conf_desc, parameters ) ) diff --git a/src/TNL/Communicators/MPITypeResolver.h b/src/TNL/Communicators/MPITypeResolver.h deleted file mode 100644 index 5429d5e33..000000000 --- a/src/TNL/Communicators/MPITypeResolver.h +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** - MPITypeResolver.h - description - ------------------- - begin : Feb 4, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Communicators { - -#ifdef HAVE_MPI -template -struct MPITypeResolver -{ - static inline MPI_Datatype getType() - { - static_assert( sizeof(Type) == sizeof(char) || - sizeof(Type) == sizeof(int) || - sizeof(Type) == sizeof(short int) || - sizeof(Type) == sizeof(long int), - "Fatal Error - Unknown MPI Type"); - switch( sizeof( Type ) ) - { - case sizeof( char ): - return MPI_CHAR; - case sizeof( int ): - return MPI_INT; - case sizeof( short int ): - return MPI_SHORT; - case sizeof( long int ): - return MPI_LONG; - } - // this will never happen thanks to the static_assert above, but icpc is not that smart - // and complains about missing return statement at the end of non-void function - throw 0; - } -}; - -template<> struct MPITypeResolver< char > -{ - static inline MPI_Datatype getType(){return MPI_CHAR;}; -}; - -template<> struct MPITypeResolver< int > -{ - static inline MPI_Datatype getType(){return MPI_INT;}; -}; - -template<> struct MPITypeResolver< short int > -{ - static inline MPI_Datatype getType(){return MPI_SHORT;}; -}; - -template<> struct MPITypeResolver< long int > -{ - static inline MPI_Datatype getType(){return MPI_LONG;}; -}; - -template<> struct MPITypeResolver< unsigned char > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;}; -}; - -template<> struct 
MPITypeResolver< unsigned short int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;}; -}; - -template<> struct MPITypeResolver< unsigned int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED;}; -}; - -template<> struct MPITypeResolver< unsigned long int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;}; -}; - -template<> struct MPITypeResolver< float > -{ - static inline MPI_Datatype getType(){return MPI_FLOAT;}; -}; - -template<> struct MPITypeResolver< double > -{ - static inline MPI_Datatype getType(){return MPI_DOUBLE;}; -}; - -template<> struct MPITypeResolver< long double > -{ - static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;}; -}; - -template<> struct MPITypeResolver< bool > -{ - // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859 - static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." ); - static inline MPI_Datatype getType() { return MPI_C_BOOL; }; -}; -#endif - -} // namespace Communicators -} // namespace TNL diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index dedc35f03..1995978c5 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -11,36 +11,23 @@ #pragma once #include -#include -#include #ifdef HAVE_MPI -#include #ifdef OMPI_MAJOR_VERSION // header specific to OpenMPI (needed for CUDA-aware detection) #include #endif #include // getpid - -#ifdef HAVE_CUDA - #include - - typedef struct __attribute__((__packed__)) { - char name[MPI_MAX_PROCESSOR_NAME]; - } procName; -#endif - #endif #include #include -#include -#include +#include +#include +#include #include -#include #include -#include namespace TNL { @@ -88,7 +75,7 @@ class MpiCommunicator const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" ); if( 
redirect ) - setupRedirection( outputDirectory ); + MPI::setupRedirection( outputDirectory ); #ifdef HAVE_CUDA int size; MPI_Comm_size( MPI_COMM_WORLD, &size ); @@ -144,125 +131,32 @@ class MpiCommunicator static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { -#ifdef HAVE_MPI - switch( required_thread_level ) { - case MPI_THREAD_SINGLE: - case MPI_THREAD_FUNNELED: - case MPI_THREAD_SERIALIZED: - case MPI_THREAD_MULTIPLE: - break; - default: - printf("ERROR: invalid argument for the 'required' thread level support: %d\n", required_thread_level); - MPI_Abort(MPI_COMM_WORLD, 1); - } - - int provided; - MPI_Init_thread( &argc, &argv, required_thread_level, &provided ); - if( provided < required_thread_level ) { - const char* level = ""; - switch( required_thread_level ) { - case MPI_THREAD_SINGLE: - level = "MPI_THREAD_SINGLE"; - break; - case MPI_THREAD_FUNNELED: - level = "MPI_THREAD_FUNNELED"; - break; - case MPI_THREAD_SERIALIZED: - level = "MPI_THREAD_SERIALIZED"; - break; - case MPI_THREAD_MULTIPLE: - level = "MPI_THREAD_MULTIPLE"; - break; - } - printf("ERROR: The MPI library does not have the required level of thread support: %s\n", level); - MPI_Abort(MPI_COMM_WORLD, 1); - } - - selectGPU(); -#endif + MPI::Init( argc, argv, required_thread_level ); // silence warnings about (potentially) unused variables (void) NullGroup; - (void) NullRequest; - } - - static void setupRedirection( std::string outputDirectory ) - { -#ifdef HAVE_MPI - if(isDistributed() ) - { - if(GetRank(AllGroup)!=0) - { - const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank(AllGroup)) + ".txt"; - const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank(AllGroup)) + ".txt"; - std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl; - Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); - } - } -#endif } static void 
Finalize() { -#ifdef HAVE_MPI - if(isDistributed()) - { - if(GetRank(AllGroup)!=0) - { - // restore redirection (not necessary, it uses RAII internally...) - Debugging::redirect_stdout_stderr( "", "", true ); - } - } - MPI_Finalize(); -#endif + MPI::Finalize(); } static bool IsInitialized() { -#ifdef HAVE_MPI - int initialized, finalized; - MPI_Initialized(&initialized); - MPI_Finalized(&finalized); - return initialized && !finalized; -#else - return true; -#endif + return MPI::isInitialized(); } static int GetRank(CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "GetRank cannot be called with NullGroup"); - int rank; - MPI_Comm_rank(group,&rank); - return rank; -#else - return 0; -#endif + return MPI::GetRank( group ); } static int GetSize(CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "GetSize cannot be called with NullGroup"); - int size; - MPI_Comm_size(group,&size); - return size; -#else - return 1; -#endif + return MPI::GetSize( group ); } -#ifdef HAVE_MPI - template< typename T > - static MPI_Datatype getDataType( const T& t ) - { - return MPITypeResolver< T >::getType(); - } -#endif - //dim-number of dimensions, distr array of guess distr - 0 for computation //distr array will be filled by computed distribution //more information in MPI documentation @@ -291,78 +185,42 @@ class MpiCommunicator static void Barrier( CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "Barrier cannot be called with NullGroup"); - MPI_Barrier(group); -#endif + MPI::Barrier( group ); } template static void Send( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup ) 
{ -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "Send cannot be called with NullGroup"); - MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group ); -#endif + MPI::Send( data, count, dest, tag, group ); } template static void Recv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup"); - MPI_Status status; - MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status ); -#endif - } + MPI::Recv( data, count, src, tag, group ); + } template static Request ISend( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "ISend cannot be called with NullGroup"); - Request req; - MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req); - return req; -#else - return 1; -#endif + return MPI::Isend( data, count, dest, tag, group ); } template static Request IRecv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "IRecv cannot be called with NullGroup"); - Request req; - MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req); - return req; -#else - return 1; -#endif + return MPI::Irecv( data, count, src, tag, group ); } static void WaitAll(Request *reqs, int length) { -#ifdef HAVE_MPI - 
TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE); -#endif + MPI::Waitall( reqs, length ); } template< typename T > static void Bcast( T* data, int count, int root, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup"); - MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group); -#endif + MPI::Bcast( data, count, root, group ); } template< typename T > @@ -372,12 +230,7 @@ class MpiCommunicator const MPI_Op &op, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); - MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group); -#else - memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) ); -#endif + MPI::Allreduce( data, reduced_data, count, op, group ); } // in-place variant of Allreduce @@ -387,27 +240,18 @@ class MpiCommunicator const MPI_Op &op, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); - MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group); -#endif + MPI::Allreduce( data, count, op, group ); } - template< typename T > static void Reduce( const T* data, T* reduced_data, int count, - MPI_Op &op, + const MPI_Op &op, int root, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup"); - MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group); -#else - memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) ); -#endif + MPI::Reduce( data, reduced_data, count, op, root, group ); } template< typename 
T > @@ -421,24 +265,7 @@ class MpiCommunicator int receiveTag, CommunicationGroup group ) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup"); - MPI_Status status; - MPI_Sendrecv( const_cast< void* >( ( void* ) sendData ), - sendCount, - MPITypeResolver< T >::getType(), - destination, - sendTag, - ( void* ) receiveData, - receiveCount, - MPITypeResolver< T >::getType(), - source, - receiveTag, - group, - &status ); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Sendrecv( sendData, sendCount, destination, sendTag, receiveData, receiveCount, source, receiveTag, group ); } template< typename T > @@ -448,19 +275,7 @@ class MpiCommunicator int receiveCount, CommunicationGroup group ) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup"); - MPI_Alltoall( const_cast< void* >( ( void* ) sendData ), - sendCount, - MPITypeResolver< T >::getType(), - ( void* ) receiveData, - receiveCount, - MPITypeResolver< T >::getType(), - group ); -#else - TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." 
); - memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) ); -#endif + MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group ); } @@ -485,58 +300,16 @@ class MpiCommunicator } #ifdef HAVE_MPI - static MPI_Request NullRequest; static MPI_Comm AllGroup; static MPI_Comm NullGroup; #else - static constexpr int NullRequest = -1; static constexpr int AllGroup = 1; static constexpr int NullGroup = 0; #endif private: - - static void selectGPU(void) - { -#ifdef HAVE_MPI - #ifdef HAVE_CUDA - const int count = GetSize(AllGroup); - const int rank = GetRank(AllGroup); - int gpuCount; - cudaGetDeviceCount(&gpuCount); - - procName names[count]; - - int i=0; - int len; - MPI_Get_processor_name(names[rank].name, &len); - - for(i=0;i -#include // important only when MPI is disabled namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h index 1cef0873d..2695ccccc 100644 --- a/src/TNL/Containers/Expressions/DistributedComparison.h +++ b/src/TNL/Containers/Expressions/DistributedComparison.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index b525e8a53..f55ae3d4a 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h new file mode 100644 index 000000000..b1b7dd698 --- /dev/null +++ b/src/TNL/MPI.h @@ -0,0 +1,29 @@ +/*************************************************************************** + MPI.h - description + ------------------- + begin : Dec 29, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +/** + * \brief A convenient header file which includes all headers from the + * `TNL/MPI/` subdirectory. + * + * Users may use this to avoid having to include many header files in their + * projects. On the other hand, parts of the TNL library should generally + * include only the specific headers they need, in order to avoid cycles in + * the header inclusion. + */ + +#include "MPI/DummyDefs.h" +#include "MPI/getDataType.h" +#include "MPI/selectGPU.h" +#include "MPI/Wrappers.h" +#include "MPI/Utils.h" +#include "MPI/ScopedInitializer.h" +#include "MPI/Print.h" diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/MPI/DummyDefs.h similarity index 64% rename from src/TNL/Communicators/MpiDefs.h rename to src/TNL/MPI/DummyDefs.h index df43005ec..cdd5ea483 100644 --- a/src/TNL/Communicators/MpiDefs.h +++ b/src/TNL/MPI/DummyDefs.h @@ -1,8 +1,8 @@ /*************************************************************************** - MpiCommunicator.h - description + MPI/DummyDefs.h - description ------------------- - begin : 2005/04/23 - copyright : (C) 2005 by Tomas Oberhuber + begin : Dec 29, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. 
email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ @@ -11,6 +11,9 @@ #pragma once #ifndef HAVE_MPI +using MPI_Request = int; +using MPI_Comm = int; + enum MPI_Op { MPI_MAX, MPI_MIN, @@ -28,9 +31,9 @@ enum MPI_Op { // MPI_Init_thread constants enum { - MPI_THREAD_SINGLE, - MPI_THREAD_FUNNELED, - MPI_THREAD_SERIALIZED, - MPI_THREAD_MULTIPLE + MPI_THREAD_SINGLE, + MPI_THREAD_FUNNELED, + MPI_THREAD_SERIALIZED, + MPI_THREAD_MULTIPLE }; #endif diff --git a/src/TNL/Communicators/MPIPrint.h b/src/TNL/MPI/Print.h similarity index 75% rename from src/TNL/Communicators/MPIPrint.h rename to src/TNL/MPI/Print.h index 6d78eafaf..5cd4819a2 100644 --- a/src/TNL/Communicators/MPIPrint.h +++ b/src/TNL/MPI/Print.h @@ -1,8 +1,8 @@ /*************************************************************************** - MPIPrint.h - description + MPI/Print.h - description ------------------- begin : Feb 7, 2019 - copyright : (C) 2019 by Tomas Oberhuber + copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ @@ -10,34 +10,35 @@ #pragma once +#include #include -#include + +#include +#include #ifdef HAVE_MPI #define TNL_MPI_PRINT( message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! 
TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ std::cerr << message << std::endl; \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() > 0 ) \ + if( TNL::MPI::GetRank() > 0 ) \ { \ std::stringstream __tnl_mpi_print_stream_; \ - __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of " \ - << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ + __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : " \ + << message << std::endl; \ TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() ); \ mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() ); \ } \ else \ { \ - std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ - for( int __tnl_mpi_print_j = 1; \ - __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize(); \ - __tnl_mpi_print_j++ ) \ - { \ - TNL::String __tnl_mpi_print_string_; \ - mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() ); \ - std::cerr << __tnl_mpi_print_string_; \ - } \ + std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl; \ + for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ ) \ + { \ + TNL::String __tnl_mpi_print_string_; \ + mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() ); \ + std::cerr << __tnl_mpi_print_string_; \ + } \ } \ } #else @@ -47,11 +48,11 @@ else #ifdef HAVE_MPI #define TNL_MPI_PRINT_MASTER( message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! 
TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ std::cerr << message << std::endl; \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() == 0 ) \ + if( TNL::MPI::GetRank() == 0 ) \ { \ std::cerr << "Master node : " << message << std::endl; \ } \ @@ -63,20 +64,20 @@ else #ifdef HAVE_MPI #define TNL_MPI_PRINT_COND( condition, message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ { \ if( condition) std::cerr << message << std::endl; \ } \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() > 0 ) \ + if( TNL::MPI::GetRank() > 0 ) \ { \ int __tnl_mpi_print_cnd = ( condition ); \ - TNL::Communicators::MpiCommunicator::Send( &__tnl_mpi_print_cnd, 1, 0, 0 ); \ + TNL::MPI::Send( &__tnl_mpi_print_cnd, 1, 0, 0 ); \ if( condition ) { \ std::stringstream __tnl_mpi_print_stream_; \ - __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of " \ - << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ + __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : " \ + << message << std::endl; \ TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() ); \ mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() ); \ } \ @@ -84,13 +85,11 @@ else else \ { \ if( condition ) \ - std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ - for( int __tnl_mpi_print_j = 1; \ - __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize(); \ - __tnl_mpi_print_j++ ) \ + std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl; \ + for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ ) \ { \ int __tnl_mpi_print_cond; \ - TNL::Communicators::MpiCommunicator::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 ); \ + TNL::MPI::Recv( 
&__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 ); \ if( __tnl_mpi_print_cond ) \ { \ TNL::String __tnl_mpi_print_string_; \ diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h similarity index 72% rename from src/TNL/Communicators/ScopedInitializer.h rename to src/TNL/MPI/ScopedInitializer.h index 2970bc628..82ba02bc5 100644 --- a/src/TNL/Communicators/ScopedInitializer.h +++ b/src/TNL/MPI/ScopedInitializer.h @@ -12,22 +12,25 @@ #pragma once +#include "Wrappers.h" +#include "Utils.h" + namespace TNL { -namespace Communicators { +namespace MPI { -template< typename Communicator > struct ScopedInitializer { - ScopedInitializer( int& argc, char**& argv ) + ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { - Communicator::Init( argc, argv ); + Init( argc, argv, required_thread_level ); } ~ScopedInitializer() { - Communicator::Finalize(); + restoreRedirection(); + Finalize(); } }; -} // namespace Communicators +} // namespace MPI } // namespace TNL diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h new file mode 100644 index 000000000..b655aefd0 --- /dev/null +++ b/src/TNL/MPI/Utils.h @@ -0,0 +1,46 @@ +/*************************************************************************** + MPI/Utils.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +#include "Wrappers.h" + +namespace TNL { +namespace MPI { + +inline bool isInitialized() +{ + return Initialized() && ! 
Finalized(); +} + +inline void setupRedirection( std::string outputDirectory ) +{ +#ifdef HAVE_MPI + if( GetSize() > 1 && GetRank() != 0 ) { + const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank()) + ".txt"; + const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank()) + ".txt"; + std::cout << GetRank() << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl; + Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); + } +#endif +} + +// restore redirection (usually not necessary, it uses RAII internally...) +inline void restoreRedirection() +{ + if( GetSize() > 1 && GetRank() != 0 ) { + Debugging::redirect_stdout_stderr( "", "", true ); + } +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h new file mode 100644 index 000000000..9a057da5f --- /dev/null +++ b/src/TNL/MPI/Wrappers.h @@ -0,0 +1,347 @@ +/*************************************************************************** + MPI/Wrappers.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +#ifdef HAVE_MPI + #include +#else + #include "DummyDefs.h" + #include // std::memcpy + #include +#endif + +#include +#include "getDataType.h" +#include "selectGPU.h" + +namespace TNL { +namespace MPI { + +// function wrappers for MPI constants + +inline MPI_Comm AllGroup() +{ +#ifdef HAVE_MPI + return MPI_COMM_WORLD; +#else + return 1; +#endif +} + +inline MPI_Comm NullGroup() +{ +#ifdef HAVE_MPI + return MPI_COMM_NULL; +#else + return 0; +#endif +} + +inline MPI_Request NullRequest() +{ +#ifdef HAVE_MPI + return MPI_REQUEST_NULL; +#else + return 0; +#endif +} + +// wrappers for basic MPI functions + +inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) +{ +#ifdef HAVE_MPI + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + case MPI_THREAD_FUNNELED: + case MPI_THREAD_SERIALIZED: + case MPI_THREAD_MULTIPLE: + break; + default: + std::cerr << "ERROR: invalid argument for the 'required' thread level support: " << required_thread_level << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int provided; + MPI_Init_thread( &argc, &argv, required_thread_level, &provided ); + if( provided < required_thread_level ) { + const char* level = ""; + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + level = "MPI_THREAD_SINGLE"; + break; + case MPI_THREAD_FUNNELED: + level = "MPI_THREAD_FUNNELED"; + break; + case MPI_THREAD_SERIALIZED: + level = "MPI_THREAD_SERIALIZED"; + break; + case MPI_THREAD_MULTIPLE: + level = "MPI_THREAD_MULTIPLE"; + break; + } + std::cerr << "ERROR: The MPI library does not have the required level of thread support: " << level << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + selectGPU(); +#endif +} + +inline void Finalize() +{ +#ifdef HAVE_MPI + MPI_Finalize(); +#endif +} + +inline bool 
Initialized() +{ +#ifdef HAVE_MPI + int flag; + MPI_Initialized(&flag); + return flag; +#else + return true; +#endif +} + +inline bool Finalized() +{ +#ifdef HAVE_MPI + int flag; + MPI_Finalized(&flag); + return flag; +#else + return false; +#endif +} + +inline int GetRank( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "GetRank cannot be called with NullGroup" ); + int rank; + MPI_Comm_rank( group, &rank ); + return rank; +#else + return 0; +#endif +} + +inline int GetSize( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "GetSize cannot be called with NullGroup" ); + int size; + MPI_Comm_size( group, &size ); + return size; +#else + return 1; +#endif +} + +// wrappers for MPI communication functions + +inline void Barrier( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Barrier cannot be called with NullGroup" ); + MPI_Barrier(group); +#endif +} + +inline void Waitall( MPI_Request* reqs, int length ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + MPI_Waitall( length, reqs, MPI_STATUSES_IGNORE ); +#endif +} + +template< typename T > +void Send( const T* data, + int count, + int dest, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Send cannot be called with NullGroup" ); + MPI_Send( (const void*) data, count, getDataType(), dest, tag, group ); +#endif +} + +template< typename T > +void Recv( T* data, + int count, + int src, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Recv cannot be called with NullGroup" ); + MPI_Recv( (void*) data, count, getDataType(), src, tag, group, MPI_STATUS_IGNORE ); +#endif +} + +template< typename T > +void Sendrecv( const T* sendData, + int sendCount, + int destination, + int sendTag, + T* receiveData, + int receiveCount, + int source, + int receiveTag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Sendrecv cannot be called with NullGroup" ); + MPI_Sendrecv( (void*) sendData, + sendCount, + getDataType(), + destination, + sendTag, + (void*) receiveData, + receiveCount, + getDataType(), + source, + receiveTag, + group, + MPI_STATUS_IGNORE ); +#else + throw Exceptions::MPISupportMissing(); +#endif +} + +template< typename T > +MPI_Request Isend( const T* data, + int count, + int dest, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Isend cannot be called with NullGroup" ); + MPI_Request req; + MPI_Isend( (const void*) data, count, getDataType(), dest, tag, group, &req ); + return req; +#else + return NullRequest(); +#endif +} + +template< typename T > +MPI_Request Irecv( T* data, + int count, + int src, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Irecv cannot be called with NullGroup" ); + MPI_Request req; + MPI_Irecv( (void*) data, count, getDataType(), src, tag, group, &req ); + return req; +#else + return NullRequest(); +#endif +} + +template< typename T > +void Allreduce( const T* data, + T* reduced_data, + int count, + const MPI_Op& op, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType(), op, group ); +#else + std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) ); +#endif +} + +// in-place variant of Allreduce +template< typename T > +void Allreduce( T* data, + int count, + const MPI_Op& op, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType(), op, group ); +#endif +} + +template< typename T > +void Reduce( const T* data, + T* reduced_data, + int count, + const MPI_Op& op, + int root, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Reduce cannot be called with NullGroup" ); + MPI_Reduce( (const void*) data, (void*) reduced_data, count, getDataType(), op, root, group ); +#else + std::memcpy( (void*) reduced_data, (void*) data, count * sizeof(T) ); +#endif +} + +template< typename T > +void Bcast( T* data, int count, int root, MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Bcast cannot be called with NullGroup" ); + MPI_Bcast( (void*) data, count, getDataType(), root, group ); +#endif +} + +template< typename T > +void Alltoall( const T* sendData, + int sendCount, + T* receiveData, + int receiveCount, + MPI_Comm group ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Alltoall cannot be called with NullGroup" ); + MPI_Alltoall( (const void*) sendData, + sendCount, + getDataType(), + (void*) receiveData, + receiveCount, + getDataType(), + group ); +#else + TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." ); + std::memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof(T) ); +#endif +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/getDataType.h b/src/TNL/MPI/getDataType.h new file mode 100644 index 000000000..f3570679b --- /dev/null +++ b/src/TNL/MPI/getDataType.h @@ -0,0 +1,119 @@ +/*************************************************************************** + getDataType.h - description + ------------------- + begin : Feb 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#ifdef HAVE_MPI + #include +#endif + +namespace TNL { +namespace MPI { + +#ifdef HAVE_MPI +template< typename T > +struct TypeResolver +{ + static inline MPI_Datatype getType() + { + static_assert( sizeof(T) == sizeof(char) || + sizeof(T) == sizeof(int) || + sizeof(T) == sizeof(short int) || + sizeof(T) == sizeof(long int), + "Fatal Error - Unknown MPI Type"); + switch( sizeof(T) ) + { + case sizeof(char): + return MPI_CHAR; + case sizeof(int): + return MPI_INT; + case sizeof(short int): + return MPI_SHORT; + case sizeof(long int): + return MPI_LONG; + } + // This will never happen thanks to the static_assert above, but icpc is + // not that smart and complains about missing return statement at the end + // of non-void function. + throw 0; + } +}; + +template<> struct TypeResolver< char > +{ + static inline MPI_Datatype getType(){return MPI_CHAR;}; +}; + +template<> struct TypeResolver< int > +{ + static inline MPI_Datatype getType(){return MPI_INT;}; +}; + +template<> struct TypeResolver< short int > +{ + static inline MPI_Datatype getType(){return MPI_SHORT;}; +}; + +template<> struct TypeResolver< long int > +{ + static inline MPI_Datatype getType(){return MPI_LONG;}; +}; + +template<> struct TypeResolver< unsigned char > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;}; +}; + +template<> struct TypeResolver< unsigned short int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;}; +}; + +template<> struct TypeResolver< unsigned int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED;}; +}; + +template<> struct TypeResolver< unsigned long int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;}; +}; + +template<> struct TypeResolver< float > +{ + static inline MPI_Datatype getType(){return MPI_FLOAT;}; +}; + 
+template<> struct TypeResolver< double > +{ + static inline MPI_Datatype getType(){return MPI_DOUBLE;}; +}; + +template<> struct TypeResolver< long double > +{ + static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;}; +}; + +template<> struct TypeResolver< bool > +{ + // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859 + static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." ); + static inline MPI_Datatype getType() { return MPI_C_BOOL; }; +}; + +template< typename T > +MPI_Datatype getDataType( const T& = T{} ) +{ + return TypeResolver< T >::getType(); +} +#endif + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h new file mode 100644 index 000000000..def9a329f --- /dev/null +++ b/src/TNL/MPI/selectGPU.h @@ -0,0 +1,72 @@ +/*************************************************************************** + MPI/selectGPU.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +#include + +namespace TNL { +namespace MPI { +namespace { + +#ifdef HAVE_MPI +#ifdef HAVE_CUDA + typedef struct __attribute__((__packed__)) { + char name[MPI_MAX_PROCESSOR_NAME]; + } procName; +#endif +#endif + +inline void selectGPU() +{ +#ifdef HAVE_MPI +#ifdef HAVE_CUDA + int size; + MPI_Comm_size( MPI_COMM_WORLD, &size ); + int rank; + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + int gpuCount; + cudaGetDeviceCount( &gpuCount ); + + procName names[size]; + + int i=0; + int len; + MPI_Get_processor_name(names[rank].name, &len); + + for(i=0;i +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h index 6030b976f..04647cb4a 100644 --- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Meshes { diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h index 60605c6eb..99f505bba 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h @@ -12,6 +12,7 @@ #include #include +#include namespace TNL { namespace Meshes { @@ -19,7 +20,7 @@ namespace DistributedMeshes { /* - * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. + * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. 
* It is slow and has high RAM consumption */ template< typename MeshFunction, @@ -88,8 +89,8 @@ class DistributedGridIO< return true; }; - - static bool load(const String& fileName,MeshFunctionType &meshFunction) + + static bool load(const String& fileName,MeshFunctionType &meshFunction) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); if(distrGrid==NULL) //not distributed @@ -99,10 +100,10 @@ class DistributedGridIO< } const MeshType& mesh=meshFunction.getMesh(); - + PointType spaceSteps=mesh.getSpaceSteps(); PointType origin=mesh.getOrigin(); - + CoordinatesType localSize=distrGrid->getLocalSize(); CoordinatesType localBegin=distrGrid->getLocalBegin(); @@ -111,33 +112,33 @@ class DistributedGridIO< newMesh->setSpaceSteps(spaceSteps); CoordinatesType newOrigin; newMesh->setOrigin(origin+spaceSteps*localBegin); - + VectorType newDof(newMesh-> template getEntitiesCount< typename MeshType::Cell >()); MeshFunctionType newMeshFunction; - newMeshFunction.bind(newMesh,newDof); + newMeshFunction.bind(newMesh,newDof); CoordinatesType zeroCoord; - zeroCoord.setValue(0); + zeroCoord.setValue(0); File file; file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), std::ios_base::in ); newMeshFunction.boundLoad(file); file.close(); CopyEntitiesHelper::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize); - + return true; }; - + }; /* - * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, + * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, * EXPLOSIVE: works with only Grids and MPI * BAD IMPLEMENTTION creating MPI-Types at every save! -- I dont want contamine more places by MPI.. 
*/ #ifdef HAVE_MPI -template +template class DistributedGridIO_MPIIOBase { public: @@ -152,7 +153,7 @@ class DistributedGridIO_MPIIOBase static bool save(const String& fileName, MeshFunctionType &meshFunction, RealType *data) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - + if(distrGrid==NULL) //not distributed { meshFunction.save(fileName); @@ -168,7 +169,7 @@ class DistributedGridIO_MPIIOBase &file); if( ok != 0 ) throw std::runtime_error("Open file falied"); - + int written=save(file,meshFunction, data,0); MPI_File_close(&file); @@ -176,7 +177,7 @@ class DistributedGridIO_MPIIOBase return written>0; }; - + static int save(MPI_File &file, MeshFunctionType &meshFunction, RealType *data, int offset) { @@ -187,7 +188,7 @@ class DistributedGridIO_MPIIOBase int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); int headerSize; - + MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL); if(Communicators::MpiCommunicator::GetRank(group)==0) @@ -200,9 +201,9 @@ class DistributedGridIO_MPIIOBase offset +=headerSize; MPI_File_set_view(file,offset, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype,"native",MPI_INFO_NULL); - + MPI_Status wstatus; MPI_File_write(file,data,1,atype,&wstatus); @@ -222,7 +223,7 @@ class DistributedGridIO_MPIIOBase int fstarts[dim]; int flsize[dim]; int fgsize[dim]; - + hackArray(dim,fstarts,distrGrid->getGlobalBegin().getData()); hackArray(dim,flsize,distrGrid->getLocalSize().getData()); hackArray(dim,fgsize,distrGrid->getGlobalSize().getData()); @@ -230,14 +231,14 @@ class DistributedGridIO_MPIIOBase MPI_Type_create_subarray(dim, fgsize,flsize,fstarts, MPI_ORDER_C, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype); MPI_Type_commit(ftype); int agsize[dim]; int alsize[dim]; - int astarts[dim]; + int astarts[dim]; hackArray(dim,astarts,distrGrid->getLocalBegin().getData()); hackArray(dim,alsize,distrGrid->getLocalSize().getData()); @@ -246,7 +247,7 @@ class 
DistributedGridIO_MPIIOBase MPI_Type_create_subarray(dim, agsize,alsize,astarts, MPI_ORDER_C, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), atype); MPI_Type_commit(atype); @@ -350,9 +351,9 @@ class DistributedGridIO_MPIIOBase MPI_File_close(&file); return ret; } - + /* Funky bomb - no checks - only dirty load */ - static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) + static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); @@ -360,7 +361,7 @@ class DistributedGridIO_MPIIOBase MPI_Datatype ftype; MPI_Datatype atype; int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); - + MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL); int headerSize=0; @@ -371,18 +372,18 @@ class DistributedGridIO_MPIIOBase headerSize=readMeshFunctionHeader(file,meshFunction,dataCount); } MPI_Bcast(&headerSize, 1, MPI_INT,0, group); - + if(headerSize<0) return false; offset+=headerSize; MPI_File_set_view(file,offset, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype,"native",MPI_INFO_NULL); MPI_Status wstatus; MPI_File_read(file,(void*)data,1,atype,&wstatus); - + MPI_Type_free(&atype); MPI_Type_free(&ftype); @@ -412,7 +413,7 @@ class DistributedGridIO_MPIIOBase size+=count*sizeof(char); MPI_File_read(file, (void *)&count,1, MPI_INT, &rstatus);//DATACOUNT size+=1*sizeof(int); - + if(count!=length) { std::cerr<<"Chyba načítání MeshFunction, délka dat v souboru neodpovídá očekávané délce" << std::endl; @@ -421,7 +422,7 @@ class DistributedGridIO_MPIIOBase return size; }; - + }; #endif @@ -444,10 +445,10 @@ class DistributedGridIO< #ifdef HAVE_MPI if(Communicators::MpiCommunicator::IsInitialized())//i.e. 
- isUsed { - using HostVectorType = Containers::Vector; + using HostVectorType = Containers::Vector; HostVectorType hostVector; hostVector=meshFunction.getData(); - typename MeshFunctionType::RealType * data=hostVector.getData(); + typename MeshFunctionType::RealType * data=hostVector.getData(); return DistributedGridIO_MPIIOBase::save(fileName,meshFunction,data); } #endif @@ -455,12 +456,12 @@ class DistributedGridIO< return false; }; - static bool load(const String& fileName,MeshFunctionType &meshFunction) + static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed { - using HostVectorType = Containers::Vector; + using HostVectorType = Containers::Vector; HostVectorType hostVector; hostVector.setLike(meshFunction.getData()); auto* data=hostVector.getData(); @@ -501,7 +502,7 @@ class DistributedGridIO< return false; }; - static bool load(const String& fileName,MeshFunctionType &meshFunction) + static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI if(Communicators::MpiCommunicator::IsInitialized())//i.e. 
- isUsed diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h index 5a1150240..7bc17f920 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h @@ -16,7 +16,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h index 9182c620f..5c35c7c33 100644 --- a/src/TNL/Solvers/Solver_impl.h +++ b/src/TNL/Solvers/Solver_impl.h @@ -16,11 +16,11 @@ #include #include #include -#include +#include namespace TNL { namespace Solvers { - + template< template< typename Real, typename Device, typename Index, typename MeshType, typename MeshConfig, typename SolverStarter, typename CommunicatorType > class ProblemSetter, template< typename MeshConfig > class ProblemConfig, typename MeshConfig > @@ -37,7 +37,7 @@ run( int argc, char* argv[] ) Devices::Cuda::configSetup( configDescription ); Communicators::MpiCommunicator::configSetup( configDescription ); - Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv ); + TNL::MPI::ScopedInitializer mpi( argc, argv ); if( ! parseCommandLine( argc, argv, configDescription, parameters ) ) return false; diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp index c33ae8294..a2d4f48e9 100644 --- a/src/Tools/tnl-game-of-life.cpp +++ b/src/Tools/tnl-game-of-life.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include using namespace TNL; @@ -361,7 +361,7 @@ int main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! 
parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp index 1a7769b5c..73765aafb 100644 --- a/src/Tools/tnl-init.cpp +++ b/src/Tools/tnl-init.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include using namespace TNL; @@ -55,7 +55,7 @@ int main( int argc, char* argv[] ) setupConfig( configDescription ); Communicators::MpiCommunicator::configSetup( configDescription ); - Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! parseCommandLine( argc, argv, configDescription, parameters ) ) return EXIT_FAILURE; diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h index 0be53242b..1b8c59c75 100644 --- a/src/Tools/tnl-test-distributed-mesh.h +++ b/src/Tools/tnl-test-distributed-mesh.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include using namespace TNL; @@ -431,7 +431,7 @@ int main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! 
parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 113d1daa3..366535cc7 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index 145b0db5b..aba9420f0 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index d80e467f5..3c637de4d 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index a072b2e80..93d6c3036 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h index 7decaf575..b778937b6 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h +++ 
b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h index 0f8f4b059..4c89b60ba 100644 --- a/src/UnitTests/main_mpi.h +++ b/src/UnitTests/main_mpi.h @@ -7,7 +7,7 @@ #if (defined(HAVE_GTEST) && defined(HAVE_MPI)) #include -#include +#include using CommunicatorType = TNL::Communicators::MpiCommunicator; #include @@ -58,7 +58,7 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - TNL::Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); #endif return RUN_ALL_TESTS(); #else -- GitLab From 5375835239bf201a79e637d767e4020ff344eb46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 29 Dec 2020 22:38:03 +0100 Subject: [PATCH 36/50] MPI: added getRankOnNode and removed MPI_Get_processor_name from selectGPU --- Documentation/Pages/main-page.md | 6 ++--- src/TNL/MPI/Utils.h | 30 +++++++++++++++++++++ src/TNL/MPI/Wrappers.h | 7 ++++- src/TNL/MPI/selectGPU.h | 45 ++++---------------------------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/Documentation/Pages/main-page.md b/Documentation/Pages/main-page.md index db9aceccb..5693f92a0 100644 --- a/Documentation/Pages/main-page.md +++ b/Documentation/Pages/main-page.md @@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries. - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for computations on Nvidia GPUs. - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can - use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for - distributed computing. For distributed CUDA computations, the library must - be [CUDA-aware]( + use a library implementing the MPI-3 standard for distributed computing (e.g.
+ [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations, + the library must be [CUDA-aware]( https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/). - __Libraries:__ diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h index b655aefd0..d334aaf5b 100644 --- a/src/TNL/MPI/Utils.h +++ b/src/TNL/MPI/Utils.h @@ -42,5 +42,35 @@ inline void restoreRedirection() } } +/** + * \brief Returns a local rank ID of the current process within a group of + * processes running on a shared-memory node. + * + * The given MPI communicator is split into groups according to the + * `MPI_COMM_TYPE_SHARED` type (from MPI-3) and the rank ID of the process + * within the group is returned. + */ +inline int getRankOnNode( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + const int rank = GetRank(group); + + MPI_Info info; + MPI_Info_create( &info ); + + MPI_Comm local_comm; + MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, rank, info, &local_comm ); + + const int local_rank = GetRank( local_comm ); + + MPI_Comm_free(&local_comm); + MPI_Info_free(&info); + + return local_rank; +#else + return 0; +#endif +} + } // namespace MPI } // namespace TNL diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h index 9a057da5f..5527ad9af 100644 --- a/src/TNL/MPI/Wrappers.h +++ b/src/TNL/MPI/Wrappers.h @@ -22,11 +22,13 @@ #include #include "getDataType.h" -#include "selectGPU.h" namespace TNL { namespace MPI { +// forward declaration to break cyclic inclusion +inline void selectGPU(); + // function wrappers for MPI constants inline MPI_Comm AllGroup() @@ -345,3 +347,6 @@ void Alltoall( const T* sendData, } // namespace MPI } // namespace TNL + +// late inclusion to break cyclic inclusion +#include "selectGPU.h" diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h index def9a329f..781a52809 100644 --- a/src/TNL/MPI/selectGPU.h +++ b/src/TNL/MPI/selectGPU.h @@ -10,63 +10,28 @@ #pragma once -#include - #include +#include "Utils.h" + namespace TNL { namespace 
MPI { -namespace { - -#ifdef HAVE_MPI -#ifdef HAVE_CUDA - typedef struct __attribute__((__packed__)) { - char name[MPI_MAX_PROCESSOR_NAME]; - } procName; -#endif -#endif inline void selectGPU() { #ifdef HAVE_MPI #ifdef HAVE_CUDA - int size; - MPI_Comm_size( MPI_COMM_WORLD, &size ); - int rank; - MPI_Comm_rank( MPI_COMM_WORLD, &rank ); int gpuCount; - cudaGetDeviceCount( &gpuCount ); + cudaGetDeviceCount(&gpuCount); - procName names[size]; - - int i=0; - int len; - MPI_Get_processor_name(names[rank].name, &len); - - for(i=0;i } // namespace MPI } // namespace TNL -- GitLab From 3ef7f564fd8bba50d94f9f0fd12d55bcbac947c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 31 Dec 2020 23:46:38 +0100 Subject: [PATCH 37/50] MPI refactoring: removed writeProlog from MpiCommunicator --- src/TNL/Communicators/MpiCommunicator.h | 9 ------- src/TNL/Solvers/PDE/PDESolver.h | 25 +++++++++---------- src/TNL/Solvers/PDE/PDESolver_impl.h | 19 +++++++------- .../Solvers/PDE/TimeDependentPDESolver_impl.h | 2 +- .../PDE/TimeIndependentPDESolver_impl.h | 18 ++++++------- 5 files changed, 32 insertions(+), 41 deletions(-) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 1995978c5..eaf6ca634 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -278,15 +278,6 @@ class MpiCommunicator MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group ); } - - static void writeProlog( Logger& logger ) - { - if( isDistributed() ) - { - logger.writeParameter( "MPI processes:", GetSize(AllGroup) ); - } - } - static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup ) { #ifdef HAVE_MPI diff --git a/src/TNL/Solvers/PDE/PDESolver.h b/src/TNL/Solvers/PDE/PDESolver.h index b9bbcd5e2..70f19d8de 100644 --- a/src/TNL/Solvers/PDE/PDESolver.h +++ b/src/TNL/Solvers/PDE/PDESolver.h @@ -18,8 +18,8 @@ namespace TNL { namespace 
Solvers { -namespace PDE { - +namespace PDE { + template< typename Real, typename Index > class PDESolver @@ -28,8 +28,8 @@ class PDESolver using RealType = Real; using IndexType = Index; using SolverMonitorType = IterativeSolverMonitor< RealType, IndexType >; - - + + PDESolver(); static void configSetup( Config::ConfigDescription& config, @@ -38,29 +38,28 @@ class PDESolver bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ); - template< typename Communicator > bool writeProlog( Logger& logger, const Config::ParameterContainer& parameters ); - + void setIoTimer( Timer& ioTimer); void setComputeTimer( Timer& computeTimer ); - + void setTotalTimer( Timer& totalTimer ); - + void setSolverMonitor( SolverMonitorType& solverMonitor ); - + SolverMonitorType& getSolverMonitor(); - bool writeEpilog( Logger& logger ) const; - + bool writeEpilog( Logger& logger ) const; + protected: Timer *ioTimer, *computeTimer, *totalTimer; - + SolverMonitorType *solverMonitorPointer; }; - + } // namespace PDE } // namespace Solvers } // namespace TNL diff --git a/src/TNL/Solvers/PDE/PDESolver_impl.h b/src/TNL/Solvers/PDE/PDESolver_impl.h index 37ade9f38..8bdcbd86a 100644 --- a/src/TNL/Solvers/PDE/PDESolver_impl.h +++ b/src/TNL/Solvers/PDE/PDESolver_impl.h @@ -11,21 +11,22 @@ #pragma once #include +#include namespace TNL { namespace Solvers { -namespace PDE { +namespace PDE { template< typename Real, - typename Index > -PDESolver< Real, Index >::PDESolver() + typename Index > +PDESolver< Real, Index >::PDESolver() : ioTimer( 0 ), computeTimer( 0 ), totalTimer( 0 ), solverMonitorPointer( 0 ) { } - + template< typename Real, typename Index > void @@ -65,7 +66,6 @@ getSolverMonitor() template< typename Real, typename Index > - template< typename Communicator > bool PDESolver< Real, Index >:: writeProlog( Logger& logger, @@ -84,7 +84,8 @@ writeProlog( Logger& logger, else logger.writeParameter< String >( "OMP enabled:", "no", 1 ); } - 
Communicator::writeProlog( logger ); + if( MPI::isInitialized() ) + logger.writeParameter( "MPI processes:", MPI::GetSize() ); logger.writeSeparator(); const bool printGPUs = parameters.getParameter< String >( "device" ) == "cuda"; logger.writeSystemInformation( printGPUs ); @@ -116,9 +117,9 @@ void PDESolver< Real, Index >:: setTotalTimer( Timer& totalTimer ) { this->totalTimer = &totalTimer; -} - +} + } // namespace PDE } // namespace Solvers } // namespace TNL - + diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h index 46ffa6fea..0c605fb95 100644 --- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h @@ -165,7 +165,7 @@ writeProlog( Logger& logger, logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters ); logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters ); logger.writeSeparator(); - return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters ); + return BaseType::writeProlog( logger, parameters ); } template< typename Problem, diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h index 455682e2b..5292e7f41 100644 --- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h @@ -15,7 +15,7 @@ * * ***************************************************************************/ -#pragma once +#pragma once #include #include @@ -23,7 +23,7 @@ namespace TNL { namespace Solvers { -namespace PDE { +namespace PDE { template< typename Problem > @@ -75,7 +75,7 @@ setup( const Config::ParameterContainer& parameters, return false; } problem->setCommonData( this->commonDataPointer ); - + /**** * Setup the problem */ @@ -83,7 +83,7 @@ setup( const Config::ParameterContainer& parameters, { std::cerr << "The problem initiation 
failed!" << std::endl; return false; - } + } /**** * Set DOFs (degrees of freedom) @@ -91,9 +91,9 @@ setup( const Config::ParameterContainer& parameters, TNL_ASSERT_GT( problem->getDofs(), 0, "number of DOFs must be positive" ); this->dofs->setSize( problem->getDofs() ); this->dofs->setValue( 0.0 ); - this->problem->bindDofs( this->dofs ); - - + this->problem->bindDofs( this->dofs ); + + /*** * Set-up the initial condition */ @@ -102,7 +102,7 @@ setup( const Config::ParameterContainer& parameters, if( ! this->problem->setInitialCondition( parameters, this->dofs ) ) return false; std::cout << " [ OK ]" << std::endl; - + return true; } @@ -128,7 +128,7 @@ writeProlog( Logger& logger, logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters ); logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters ); logger.writeSeparator(); - return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters ); + return BaseType::writeProlog( logger, parameters ); } template< typename Problem > -- GitLab From b8ae1e278a68719bcc273fe6a68920aa176422e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 1 Jan 2021 00:04:07 +0100 Subject: [PATCH 38/50] MPI refactoring: moved setup and configSetup from MpiCommunicator into a separate header --- src/TNL/Communicators/MpiCommunicator.h | 135 ++++++------------------ src/TNL/MPI.h | 1 + src/TNL/MPI/Config.h | 103 ++++++++++++++++++ 3 files changed, 134 insertions(+), 105 deletions(-) create mode 100644 src/TNL/MPI/Config.h diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index eaf6ca634..d3b0401e5 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -10,26 +10,12 @@ #pragma once -#include - -#ifdef HAVE_MPI -#ifdef OMPI_MAJOR_VERSION - // header specific to OpenMPI (needed for CUDA-aware detection) - #include -#endif - 
-#include // getpid -#endif - -#include -#include #include #include #include -#include +#include #include - namespace TNL { //! \brief Namespace for TNL communicators. namespace Communicators { @@ -58,75 +44,13 @@ class MpiCommunicator static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ) { -#ifdef HAVE_MPI - config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true ); - config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." ); - config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false ); - config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 ); -#endif + MPI::configSetup( config, prefix ); } static bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ) { -#ifdef HAVE_MPI - if(IsInitialized())//i.e. - isUsed - { - const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); - const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" ); - if( redirect ) - MPI::setupRedirection( outputDirectory ); -#ifdef HAVE_CUDA - int size; - MPI_Comm_size( MPI_COMM_WORLD, &size ); - if( size > 1 ) - { - #if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT - std::cout << "CUDA-aware MPI detected on this system ... " << std::endl; - #elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT - std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl; - return false; - #else - std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." 
<< std::endl; - #endif - } -#endif // HAVE_CUDA - bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" ); - int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" ); - - if( gdbDebug ) - { - int rank = GetRank( MPI_COMM_WORLD ); - int pid = getpid(); - - volatile int tnlMPIDebugAttached = 0; - MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD ); - MPI_Barrier( MPI_COMM_WORLD ); - if( rank == 0 ) - { - std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl; - for( int i = 0; i < GetSize( MPI_COMM_WORLD ); i++ ) - { - MPI_Status status; - int recvPid; - MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status ); - - if( i == processToAttach || processToAttach == -1 ) - { - std::cout << " For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\"" - << " -ex \"set variable tnlMPIDebugAttached=1\"" - << " -ex \"continue\"" << std::endl; - } - } - std::cout << std::flush; - } - if( rank == processToAttach || processToAttach == -1 ) - while( ! 
tnlMPIDebugAttached ); - MPI_Barrier( MPI_COMM_WORLD ); - } - } -#endif // HAVE_MPI - return true; + return MPI::setup( parameters, prefix ); } static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) @@ -157,32 +81,6 @@ class MpiCommunicator return MPI::GetSize( group ); } - //dim-number of dimensions, distr array of guess distr - 0 for computation - //distr array will be filled by computed distribution - //more information in MPI documentation - static void DimsCreate(int nproc, int dim, int *distr) - { -#ifdef HAVE_MPI - int sum = 0, prod = 1; - for( int i = 0;i < dim; i++ ) { - sum += distr[ i ]; - prod *= distr[ i ]; - } - if( prod != 0 && prod != GetSize( AllGroup ) ) - throw Exceptions::MPIDimsCreateError(); - if(sum==0) { - for(int i=0;i + +#ifdef HAVE_MPI +#ifdef OMPI_MAJOR_VERSION + // header specific to OpenMPI (needed for CUDA-aware detection) + #include +#endif + +#include // getpid +#endif + +#include +#include +#include "Utils.h" + +namespace TNL { +namespace MPI { + +inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" ) +{ +#ifdef HAVE_MPI + config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true ); + config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." ); + config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false ); + config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 ); +#endif +} + +inline bool setup( const Config::ParameterContainer& parameters, + const String& prefix = "" ) +{ +#ifdef HAVE_MPI + if( Initialized() && ! 
Finalized() ) + { + const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); + const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" ); + if( redirect ) + MPI::setupRedirection( outputDirectory ); +#ifdef HAVE_CUDA + if( GetSize() > 1 ) + { +#if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT + std::cout << "CUDA-aware MPI detected on this system ... " << std::endl; +#elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT + std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl; + return false; +#else + std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl; +#endif + } +#endif // HAVE_CUDA + bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" ); + int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" ); + + if( gdbDebug ) + { + int rank = GetRank( MPI_COMM_WORLD ); + int pid = getpid(); + + volatile int tnlMPIDebugAttached = 0; + MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + if( rank == 0 ) + { + std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl; + for( int i = 0; i < GetSize(); i++ ) + { + MPI_Status status; + int recvPid; + MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status ); + + if( i == processToAttach || processToAttach == -1 ) + { + std::cout << " For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\"" + << " -ex \"set variable tnlMPIDebugAttached=1\"" + << " -ex \"continue\"" << std::endl; + } + } + std::cout << std::flush; + } + if( rank == processToAttach || processToAttach == -1 ) + while( ! 
tnlMPIDebugAttached ); + MPI_Barrier( MPI_COMM_WORLD ); + } + } +#endif // HAVE_MPI + return true; +} + +} // namespace MPI +} // namespace TNL -- GitLab From 0742d2a2406f4e52d5217df4c0c3944f491ec282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 1 Jan 2021 00:13:00 +0100 Subject: [PATCH 39/50] MPI refactoring: replaced MPIDimsCreateError with std::logic_error --- src/TNL/Communicators/MpiCommunicator.h | 5 +++-- src/TNL/Exceptions/MPIDimsCreateError.h | 28 ------------------------- 2 files changed, 3 insertions(+), 30 deletions(-) delete mode 100644 src/TNL/Exceptions/MPIDimsCreateError.h diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index d3b0401e5..c155cabbe 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -14,7 +14,6 @@ #include #include #include -#include namespace TNL { //! \brief Namespace for TNL communicators. @@ -189,7 +188,9 @@ class MpiCommunicator prod *= distr[ i ]; } if( prod != 0 && prod != GetSize( AllGroup ) ) - throw Exceptions::MPIDimsCreateError(); + throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions." + "Non of the dimensions is zero and product of all dimensions does " + "not fit with number of MPI processes." ); if(sum==0) { for(int i=0;i - -namespace TNL { -namespace Exceptions { - -struct MPIDimsCreateError - : public std::runtime_error -{ - MPIDimsCreateError() - : std::runtime_error( "The program tries to call MPI_Dims_create with wrong dimensions." - "Non of the dimensions is zero and product of all dimensions does not fit with number of MPI processes." 
) - {} -}; - -} // namespace Exceptions -} // namespace TNL -- GitLab From eb8b40dcbd55fd2304d02f55ffc71a0c674cb65e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 1 Jan 2021 11:31:33 +0100 Subject: [PATCH 40/50] MPI: added function for timing Allreduce operations --- src/TNL/MPI.h | 1 + src/TNL/MPI/Profiling.h | 25 +++++++++++++++++++++++++ src/TNL/MPI/Wrappers.h | 5 +++++ 3 files changed, 31 insertions(+) create mode 100644 src/TNL/MPI/Profiling.h diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h index 68e0dc48c..a5f9145b5 100644 --- a/src/TNL/MPI.h +++ b/src/TNL/MPI.h @@ -22,6 +22,7 @@ #include "MPI/DummyDefs.h" #include "MPI/getDataType.h" +#include "MPI/Profiling.h" #include "MPI/selectGPU.h" #include "MPI/Wrappers.h" #include "MPI/Utils.h" diff --git a/src/TNL/MPI/Profiling.h b/src/TNL/MPI/Profiling.h new file mode 100644 index 000000000..d50427c16 --- /dev/null +++ b/src/TNL/MPI/Profiling.h @@ -0,0 +1,25 @@ +/*************************************************************************** + MPI/Profiling.h - description + ------------------- + begin : Jan 1, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +namespace TNL { +namespace MPI { + +inline Timer& getTimerAllreduce() +{ + static Timer t; + return t; +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h index 5527ad9af..39344a128 100644 --- a/src/TNL/MPI/Wrappers.h +++ b/src/TNL/MPI/Wrappers.h @@ -22,6 +22,7 @@ #include #include "getDataType.h" +#include "Profiling.h" namespace TNL { namespace MPI { @@ -278,7 +279,9 @@ void Allreduce( const T* data, { #ifdef HAVE_MPI TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + getTimerAllreduce().start(); MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType(), op, group ); + getTimerAllreduce().stop(); #else std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) ); #endif @@ -293,7 +296,9 @@ void Allreduce( T* data, { #ifdef HAVE_MPI TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + getTimerAllreduce().start(); MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType(), op, group ); + getTimerAllreduce().stop(); #endif } -- GitLab From 5e7005a67fd1f93b6d295e01c561f5bd1dae88f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 1 Jan 2021 15:04:06 +0100 Subject: [PATCH 41/50] MPI refactoring: removed MpiCommunicator from solvers: Merson, GMRES, Linear/Traits.h --- src/TNL/Solvers/Linear/GMRES.h | 3 --- src/TNL/Solvers/Linear/GMRES_impl.h | 10 +++++----- src/TNL/Solvers/Linear/Traits.h | 10 +++------- src/TNL/Solvers/ODE/Merson_impl.h | 10 +++++----- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index e1c02f0ab..818f1c163 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -23,10 
+23,7 @@ class GMRES : public LinearSolver< Matrix > { using Base = LinearSolver< Matrix >; - - // compatibility shortcuts using Traits = Linear::Traits< Matrix >; - using CommunicatorType = typename Traits::CommunicatorType; public: using RealType = typename Base::RealType; diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index 23b563940..3b13e0b28 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -510,7 +510,7 @@ hauseholder_generate( const int i, norm_yi_squared = 2 * (normz * normz + std::fabs( y_ii ) * normz); } // no-op if the problem is not distributed - CommunicatorType::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); // XXX: normalization is slower, but more stable // y_i *= 1.0 / std::sqrt( norm_yi_squared ); @@ -534,7 +534,7 @@ hauseholder_generate( const int i, i, aux ); // no-op if the problem is not distributed - CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { @@ -559,7 +559,7 @@ hauseholder_apply_trunc( HostView out, HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 ); Algorithms::MultiDeviceMemoryOperations< Devices::Host, DeviceType >::copy( YL_i.getData(), Traits::getLocalView( y_i ).getData(), YL_i.getSize() ); // no-op if the problem is not distributed - CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) ); // NOTE: aux = t_i * (y_i, z) = 1 since t_i = 2 / ||y_i||^2 and // (y_i, z) = ||z_trunc||^2 + |z_i| ||z_trunc|| = ||y_i||^2 / 2 @@ -579,7 +579,7 @@ 
hauseholder_apply_trunc( HostView out, } // no-op if the problem is not distributed - CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); } template< typename Matrix > @@ -634,7 +634,7 @@ hauseholder_cwy_transposed( VectorViewType z, i + 1, aux ); // no-op if the problem is not distributed - Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h index 83313ed98..7a1879923 100644 --- a/src/TNL/Solvers/Linear/Traits.h +++ b/src/TNL/Solvers/Linear/Traits.h @@ -12,7 +12,7 @@ #pragma once -#include +#include #include #include #include @@ -26,8 +26,6 @@ namespace Linear { template< typename Matrix > struct Traits { - using CommunicatorType = Communicators::MpiCommunicator; - using VectorType = Containers::Vector < typename Matrix::RealType, typename Matrix::DeviceType, @@ -51,7 +49,7 @@ struct Traits static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v; } static LocalViewType getLocalView( VectorViewType v ) { return v; } - static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; } + static MPI_Comm getCommunicationGroup( const Matrix& m ) { return MPI::AllGroup(); } static void startSynchronization( VectorViewType v ) {} static void waitForSynchronization( VectorViewType v ) {} }; @@ -59,8 +57,6 @@ struct Traits template< typename Matrix, typename Communicator > struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > { - using CommunicatorType = Communicator; - using VectorType = 
Containers::DistributedVector < typename Matrix::RealType, typename Matrix::DeviceType, @@ -96,7 +92,7 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); } static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); } - static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } + static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } static void startSynchronization( VectorViewType v ) { v.startSynchronization(); } static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); } }; diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 82a6a87ff..247318f33 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -13,13 +13,13 @@ #include #include #include -#include +#include #include "Merson.h" namespace TNL { namespace Solvers { -namespace ODE { +namespace ODE { /**** * In this code we do not use constants and references as we would like to. 
@@ -154,9 +154,9 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType error( 0.0 ); if( adaptivity != 0.0 ) { - const RealType localError = + const RealType localError = max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) ); - Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup ); + MPI::Allreduce( &localError, &error, 1, MPI_MAX, MPI::AllGroup() ); } if( adaptivity == 0.0 || error < adaptivity ) @@ -185,7 +185,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) currentTau = min( currentTau, this->getMaxTau() ); #ifdef USE_MPI TNLMPI::Bcast( currentTau, 1, 0 ); -#endif +#endif } if( time + currentTau > this->getStopTime() ) currentTau = this->getStopTime() - time; //we don't want to keep such tau -- GitLab From 3c5d17e38a10f8a4b306d16c73c462416a39e604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 1 Jan 2021 21:55:37 +0100 Subject: [PATCH 42/50] MPI refactoring: removed MpiCommunicator from DistributedNDArray, added Allocator parameter to NDArray --- src/TNL/Containers/DistributedNDArray.h | 44 ++++++----- .../DistributedNDArraySynchronizer.h | 62 +++++++-------- src/TNL/Containers/DistributedNDArrayView.h | 21 +++-- src/TNL/Containers/NDArray.h | 78 ++++++++++++++++--- src/TNL/Containers/Partitioner.h | 2 +- .../DistributedNDArrayOverlaps_1D_test.h | 16 ++-- .../DistributedNDArrayOverlaps_semi1D_test.h | 16 ++-- .../ndarray/DistributedNDArray_1D_test.h | 20 ++--- .../ndarray/DistributedNDArray_semi1D_test.h | 20 ++--- 9 files changed, 157 insertions(+), 122 deletions(-) diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index 57b94a34b..c49e9e31b 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -12,34 +12,30 @@ #pragma once -#include #include -#include #include namespace TNL { namespace Containers { template< 
typename NDArray, - typename Communicator = Communicators::MpiCommunicator, typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > class DistributedNDArray { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = typename NDArray::ValueType; using DeviceType = typename NDArray::DeviceType; using IndexType = typename NDArray::IndexType; + using AllocatorType = typename NDArray::AllocatorType; using SizesHolderType = typename NDArray::SizesHolderType; using PermutationType = typename NDArray::PermutationType; - using CommunicatorType = Communicator; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using OverlapsType = Overlaps; using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; - using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Overlaps >; using LocalViewType = typename NDArray::ViewType; using ConstLocalViewType = typename NDArray::ConstViewType; @@ -49,10 +45,17 @@ public: DistributedNDArray() = default; - // The copy-constructor of TNL::Containers::Array makes shallow copy so our - // copy-constructor cannot be default. Actually, we most likely don't need - // it anyway, so let's just delete it. - DistributedNDArray( const DistributedNDArray& ) = delete; + DistributedNDArray( const AllocatorType& allocator ); + + // Copy constructor (makes a deep copy). 
+ explicit DistributedNDArray( const DistributedNDArray& ) = default; + + // Copy constructor with a specific allocator (makes a deep copy). + explicit DistributedNDArray( const DistributedNDArray& other, const AllocatorType& allocator ) + : localArray( allocator ) + { + *this = other; + } // Standard copy-semantics with deep copy, just like regular 1D array. // Mismatched sizes cause reallocations. @@ -79,8 +82,13 @@ public: return NDArray::getDimension(); } + AllocatorType getAllocator() const + { + return localArray.getAllocator(); + } + __cuda_callable__ - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -232,8 +240,8 @@ public: localEnds == other.localEnds && localArray == other.localArray; bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } @@ -375,7 +383,7 @@ public: } template< std::size_t level > - void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup ) + void setDistribution( IndexType begin, IndexType end, MPI_Comm group = MPI::AllGroup() ) { static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." 
); TNL_ASSERT_GE( begin, 0, "begin must be non-negative" ); @@ -383,7 +391,7 @@ public: TNL_ASSERT_LT( begin, end, "begin must be lesser than end" ); localBegins.template setSize< level >( begin ); localEnds.template setSize< level >( end ); - TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group, + TNL_ASSERT( this->group == MPI::NullGroup() || this->group == group, std::cerr << "different groups cannot be combined for different dimensions" ); this->group = group; } @@ -408,7 +416,7 @@ public: void reset() { localArray.reset(); - group = CommunicatorType::NullGroup; + group = MPI::NullGroup(); globalSizes = SizesHolderType{}; localBegins = LocalBeginsType{}; localEnds = SizesHolderType{}; @@ -435,7 +443,7 @@ public: protected: NDArray localArray; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); SizesHolderType globalSizes; // static sizes should have different type: localBegin is always 0, localEnd is always the full size LocalBeginsType localBegins; diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h index bcec4a7b4..cea40bc21 100644 --- a/src/TNL/Containers/DistributedNDArraySynchronizer.h +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -15,6 +15,7 @@ #include #include +#include namespace TNL { namespace Containers { @@ -69,7 +70,6 @@ public: protected: using DistributedNDArrayView = typename DistributedNDArray::ViewType; - using Communicator = typename DistributedNDArray::CommunicatorType; using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; DistributedNDArrayView array_view; @@ -88,12 +88,12 @@ protected: Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); // issue all send and receive async operations - std::vector< typename Communicator::Request > requests; - const typename Communicator::CommunicationGroup group = 
array_view.getCommunicationGroup(); + std::vector< MPI_Request > requests; + const MPI_Comm group = array_view.getCommunicationGroup(); Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); // wait until send is done - Communicator::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); // copy data from receive buffers Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); @@ -152,9 +152,9 @@ protected: dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() ); // FIXME: set proper neighbor IDs !!! - const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); - const int rank = Communicator::GetRank(group); - const int nproc = Communicator::GetSize(group); + const MPI_Comm group = array_view.getCommunicationGroup(); + const int rank = MPI::GetRank(group); + const int nproc = MPI::GetSize(group); dim_buffers.left_neighbor = (rank + nproc - 1) % nproc; dim_buffers.right_neighbor = (rank + 1) % nproc; } @@ -221,32 +221,32 @@ protected: auto& dim_buffers = buffers.template getDimBuffers< dim >(); if( LBM_HACK == false ) { - requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData(), - dim_buffers.left_send_view.getStorageSize(), - dim_buffers.left_neighbor, 0, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData(), - dim_buffers.left_recv_view.getStorageSize(), - dim_buffers.left_neighbor, 1, group ) ); - requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData(), - dim_buffers.right_send_view.getStorageSize(), - dim_buffers.right_neighbor, 1, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData(), - dim_buffers.right_recv_view.getStorageSize(), - dim_buffers.right_neighbor, 0, group ) ); + 
requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(), + dim_buffers.left_send_view.getStorageSize(), + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(), + dim_buffers.left_recv_view.getStorageSize(), + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(), + dim_buffers.right_send_view.getStorageSize(), + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(), + dim_buffers.right_recv_view.getStorageSize(), + dim_buffers.right_neighbor, 0, group ) ); } else { - requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData() + 0, - dim_buffers.left_send_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 0, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, - dim_buffers.left_recv_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 1, group ) ); - requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, - dim_buffers.right_send_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 1, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData() + 0, - dim_buffers.right_recv_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 0, group ) ); + requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0, + dim_buffers.left_send_view.getStorageSize() / 27 * 9, + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, + dim_buffers.left_recv_view.getStorageSize() / 27 * 9, + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + 
dim_buffers.left_recv_view.getStorageSize() / 27 * 18, + dim_buffers.right_send_view.getStorageSize() / 27 * 9, + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0, + dim_buffers.right_recv_view.getStorageSize() / 27 * 9, + dim_buffers.right_neighbor, 0, group ) ); } } }; diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index 102985e9c..4812bf5c0 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -12,33 +12,30 @@ #pragma once -#include #include #include +#include namespace TNL { namespace Containers { template< typename NDArrayView, - typename Communicator = Communicators::MpiCommunicator, typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > class DistributedNDArrayView { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = typename NDArrayView::ValueType; using DeviceType = typename NDArrayView::DeviceType; using IndexType = typename NDArrayView::IndexType; using SizesHolderType = typename NDArrayView::SizesHolderType; using PermutationType = typename NDArrayView::PermutationType; - using CommunicatorType = Communicator; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using OverlapsType = Overlaps; using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; - using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + using ViewType = DistributedNDArrayView< NDArrayView, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, 
Overlaps >; using LocalViewType = NDArrayView; using ConstLocalViewType = typename NDArrayView::ConstViewType; @@ -49,7 +46,7 @@ public: // explicit initialization by local array view, global sizes and local begins and ends __cuda_callable__ - DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group ) + DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, MPI_Comm group ) : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {} // Copy-constructor does shallow copy, so views can be passed-by-value into @@ -112,7 +109,7 @@ public: void reset() { localView.reset(); - group = CommunicatorType::NullGroup; + group = MPI::NullGroup(); globalSizes = SizesHolderType{}; localBegins = LocalBeginsType{}; localEnds = SizesHolderType{}; @@ -124,7 +121,7 @@ public: } __cuda_callable__ - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -276,8 +273,8 @@ public: localEnds == other.localEnds && localView == other.localView; bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } @@ -406,7 +403,7 @@ public: protected: NDArrayView localView; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); SizesHolderType globalSizes; // static sizes should have different type: localBegin is always 0, localEnd is always the full size LocalBeginsType localBegins; diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 7b8a2f31c..f8ba157ba 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -59,10 +59,8 @@ public: 
NDArrayStorage() = default; - // The copy-constructor of TNL::Containers::Array makes shallow copy so our - // copy-constructor cannot be default. Actually, we most likely don't need - // it anyway, so let's just delete it. - NDArrayStorage( const NDArrayStorage& ) = delete; + // Copy constructor (makes a deep copy). + explicit NDArrayStorage( const NDArrayStorage& ) = default; // Standard copy-semantics with deep copy, just like regular 1D array. // Mismatched sizes cause reallocations. @@ -326,21 +324,49 @@ template< typename Value, typename SizesHolder, typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename Device = Devices::Host, - typename Index = typename SizesHolder::IndexType > + typename Index = typename SizesHolder::IndexType, + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class NDArray -: public NDArrayStorage< Array< Value, Device, Index >, +: public NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > { - using Base = NDArrayStorage< Array< Value, Device, Index >, + using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; public: - // inherit all assignment operators + // inherit all constructors and assignment operators + using Base::Base; using Base::operator=; + + // default constructor + NDArray() = default; + + // implement dynamic array interface + using AllocatorType = Allocator; + + NDArray( const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator ); + } + + // Copy constructor with a specific allocator (makes a deep copy).
+ explicit NDArray( const NDArray& other, const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->array = Array< Value, Device, Index, Allocator >( allocator ); + // copy the data + *this = other; + } + + AllocatorType getAllocator() const + { + return this->array.getAllocator(); + } }; template< typename Value, @@ -372,21 +398,49 @@ template< typename Value, typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename SliceInfo = SliceInfo<>, // no slicing by default typename Device = Devices::Host, - typename Index = typename SizesHolder::IndexType > + typename Index = typename SizesHolder::IndexType, + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class SlicedNDArray -: public NDArrayStorage< Array< Value, Device, Index >, +: public NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::SlicedNDArrayBase< SliceInfo > > { - using Base = NDArrayStorage< Array< Value, Device, Index >, + using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; public: - // inherit all assignment operators + // inherit all constructors and assignment operators + using Base::Base; using Base::operator=; + + // default constructor + SlicedNDArray() = default; + + // implement dynamic array interface + using AllocatorType = Allocator; + + SlicedNDArray( const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator ); + } + + // Copy constructor with a specific allocator (makes a deep copy).
+ explicit SlicedNDArray( const SlicedNDArray& other, const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->array = Array< Value, Device, Index, Allocator >( allocator ); + // copy the data + *this = other; + } + + AllocatorType getAllocator() const + { + return this->array.getAllocator(); + } }; } // namespace Containers diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h index 32ba735e5..c2dce9e34 100644 --- a/src/TNL/Containers/Partitioner.h +++ b/src/TNL/Containers/Partitioner.h @@ -22,7 +22,7 @@ namespace TNL { namespace Containers { -template< typename Index, typename Communicator > +template< typename Index, typename Communicator = Communicators::MpiCommunicator > class Partitioner { using CommunicationGroup = typename Communicator::CommunicationGroup; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 366535cc7..36c4ea5b7 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -9,7 +9,6 @@ #ifdef HAVE_GTEST #include -#include #include #include #include @@ -33,7 +32,6 @@ class DistributedNDArrayOverlaps_1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -44,17 +42,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} ); - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType 
distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArrayOverlaps_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( globalSize ); distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -70,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Host >, - Communicators::MpiCommunicator, std::index_sequence< 2 > > #ifdef HAVE_CUDA , @@ -78,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Cuda >, - Communicators::MpiCommunicator, std::index_sequence< 2 > > #endif >; @@ -87,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h 
b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index aba9420f0..0b6838639 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -9,7 +9,6 @@ #ifdef HAVE_GTEST #include -#include #include #include #include @@ -33,7 +32,6 @@ class DistributedNDArrayOverlaps_semi1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -44,17 +42,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} ); - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArrayOverlaps_semi1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -70,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter 
Devices::Host >, - Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > #ifdef HAVE_CUDA , @@ -78,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda >, - Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > #endif >; @@ -87,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayType TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index 3c637de4d..e55192971 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -9,7 +9,6 @@ #ifdef HAVE_GTEST #include -#include #include #include #include @@ -32,7 +31,6 @@ class DistributedNDArray_1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -42,17 +40,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename 
CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArray_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( globalSize ); distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -67,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Host >, - Communicators::MpiCommunicator > + Devices::Host > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Cuda >, - Communicators::MpiCommunicator > + Devices::Cuda > > #endif >; @@ -83,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); } diff --git 
a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index 93d6c3036..e3cbb3223 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -9,7 +9,6 @@ #ifdef HAVE_GTEST #include -#include #include #include #include @@ -32,7 +31,6 @@ class DistributedNDArray_semi1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -42,17 +40,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArray_semi1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -67,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Host >, - 
Communicators::MpiCommunicator > + Devices::Host > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Cuda >, - Communicators::MpiCommunicator > + Devices::Cuda > > #endif >; @@ -83,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); } -- GitLab From ee2fd25dc248f65751ec4936a8227c67be45be4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Jan 2021 10:11:55 +0100 Subject: [PATCH 43/50] MPI refactoring: removed MpiCommunicator from algebraic data structures This affects DistributedArray, DistributedArrayView, DistributedVector, DistributedVectorView and DistributedMatrix. Allocators were added to DistributedArray and DistributedVector. Also updated all benchmarks and unit tests. 
--- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 30 ++- src/Benchmarks/LinearSolvers/benchmarks.h | 6 +- .../tnl-benchmark-linear-solvers.h | 24 +- src/Benchmarks/ODESolvers/Euler.hpp | 2 +- src/Benchmarks/ODESolvers/Merson.hpp | 10 +- .../ODESolvers/tnl-benchmark-ode-solvers.h | 16 +- src/TNL/Algorithms/DistributedScan.h | 10 +- src/TNL/Containers/DistributedArray.h | 49 +++-- src/TNL/Containers/DistributedArray.hpp | 206 +++++++++-------- src/TNL/Containers/DistributedArrayView.h | 26 +-- src/TNL/Containers/DistributedArrayView.hpp | 208 ++++++++---------- src/TNL/Containers/DistributedVector.h | 26 ++- src/TNL/Containers/DistributedVector.hpp | 118 +++++----- src/TNL/Containers/DistributedVectorView.h | 24 +- src/TNL/Containers/DistributedVectorView.hpp | 137 +++++------- .../Expressions/DistributedComparison.h | 62 +++--- .../DistributedExpressionTemplates.h | 16 +- .../DistributedVerticalOperations.h | 60 +++-- src/TNL/Containers/Partitioner.h | 31 ++- src/TNL/Matrices/DistributedMatrix.h | 42 +--- src/TNL/Matrices/DistributedMatrix_impl.h | 150 ++++++------- src/TNL/Matrices/DistributedSpMV.h | 44 ++-- .../Solvers/Linear/Preconditioners/Diagonal.h | 8 +- .../Linear/Preconditioners/Diagonal_impl.h | 8 +- src/TNL/Solvers/Linear/Preconditioners/ILU0.h | 8 +- src/TNL/Solvers/Linear/Traits.h | 18 +- src/TNL/TypeTraits.h | 17 ++ .../Containers/DistributedArrayTest.h | 21 +- .../Containers/DistributedVectorTest.h | 21 +- .../Containers/VectorBinaryOperationsTest.h | 51 ++--- .../Containers/VectorHelperFunctions.h | 1 + .../Containers/VectorUnaryOperationsTest.h | 42 ++-- .../Containers/VectorVerticalOperationsTest.h | 29 ++- .../Matrices/DistributedMatrixTest.h | 23 +- 34 files changed, 741 insertions(+), 803 deletions(-) diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index abe08210d..e8b5c9de1 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ 
b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -38,8 +38,6 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index, using namespace TNL; using namespace TNL::Benchmarks; -using CommunicatorType = Communicators::MpiCommunicator; - template< typename Matrix, typename Vector > void @@ -110,7 +108,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, // benchmark function auto compute = [&]() { matrix.vectorProduct( x, y ); - Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); + TNL::MPI::Barrier( matrix.getCommunicationGroup() ); }; benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); @@ -150,9 +148,9 @@ struct SpmvBenchmark using IndexType = typename MatrixType::IndexType; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >; - using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >; - using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using Partitioner = Containers::Partitioner< IndexType >; + using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector; static bool @@ -169,7 +167,7 @@ struct SpmvBenchmark matrix.getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( rowLengths ); - const String name = String( (CommunicatorType::isDistributed()) ? "DistSpMV" : "SpMV" ) + const String name = String( (TNL::MPI::GetSize() > 1) ? 
"DistSpMV" : "SpMV" ) + " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ @@ -189,13 +187,13 @@ struct SpmvBenchmark getTrivialOrdering( matrix, perm, iperm ); MatrixType matrix_perm; Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm ); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix_perm, vector ); else runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector ); } else { - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix, vector ); else runNonDistributed( benchmark, metadata, parameters, matrix, vector ); @@ -225,7 +223,7 @@ struct SpmvBenchmark VectorType& vector ) { // set up the distributed matrix - const auto group = CommunicatorType::AllGroup; + const auto group = TNL::MPI::AllGroup(); const auto localRange = Partitioner::splitRange( matrix.getRows(), group ); DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group ); DistributedVector distributedVector( localRange, 0, matrix.getRows(), group ); @@ -267,8 +265,8 @@ struct SpmvBenchmark DistributedVector distributedY; distributedY.setLike( distributedVector ); distributedMatrix.vectorProduct( distributedVector, distributedY ); - const int rank = CommunicatorType::GetRank( distributedMatrix.getCommunicationGroup() ); - const int nproc = CommunicatorType::GetSize( distributedMatrix.getCommunicationGroup() ); + const int rank = TNL::MPI::GetRank( distributedMatrix.getCommunicationGroup() ); + const int nproc = TNL::MPI::GetSize( distributedMatrix.getCommunicationGroup() ); typename VectorType::ViewType subY( &y[ Partitioner::getOffset( matrix.getRows(), rank, nproc ) ], Partitioner::getSizeForRank( matrix.getRows(), rank, nproc ) ); TNL_ASSERT_EQ( distributedY.getLocalView(), 
subY, "WRONG RESULT !!!" ); @@ -294,7 +292,7 @@ configSetup( Config::ConfigDescription & config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int @@ -310,14 +308,14 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); TNL::MPI::ScopedInitializer mpi(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + const int rank = TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a4c04578d..c10c996e3 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -33,10 +33,10 @@ void barrier( const Matrix& matrix ) { } -template< typename Matrix, typename Communicator > -void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix ) +template< typename Matrix > +void barrier( const Matrices::DistributedMatrix< Matrix >& matrix ) { - Communicator::Barrier( matrix.getCommunicationGroup() ); + TNL::MPI::Barrier( matrix.getCommunicationGroup() ); } template< typename Device > diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 75b1e0e25..3acfb2438 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -65,8 +65,6 @@ using namespace TNL; 
using namespace TNL::Benchmarks; using namespace TNL::Pointers; -using CommunicatorType = Communicators::MpiCommunicator; - static const std::set< std::string > valid_solvers = { "gmres", @@ -333,9 +331,9 @@ struct LinearSolversBenchmark using IndexType = typename MatrixType::IndexType; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >; - using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >; - using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using Partitioner = Containers::Partitioner< IndexType >; + using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector; static bool @@ -383,7 +381,7 @@ struct LinearSolversBenchmark matrixPointer->getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( rowLengths ); - const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" ) + const String name = String( (TNL::MPI::GetSize() > 1) ? 
"Distributed linear solvers" : "Linear solvers" ) + " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ @@ -408,13 +406,13 @@ struct LinearSolversBenchmark Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm ); Matrices::reorderArray( x0, x0_perm, perm ); Matrices::reorderArray( b, b_perm, perm ); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); else runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); } else { - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); else runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); @@ -432,7 +430,7 @@ struct LinearSolversBenchmark const VectorType& b ) { // set up the distributed matrix - const auto group = CommunicatorType::AllGroup; + const auto group = TNL::MPI::AllGroup(); const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group ); SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group ); DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group ); @@ -567,7 +565,7 @@ configSetup( Config::ConfigDescription& config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); config.addDelimiter( "Linear solver settings:" ); Solvers::IterativeSolver< double, int >::configSetup( config ); @@ -593,13 +591,13 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); TNL::MPI::ScopedInitializer mpi(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + const int rank = 
TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp index 5039417b7..fcc8654be 100644 --- a/src/Benchmarks/ODESolvers/Euler.hpp +++ b/src/Benchmarks/ODESolvers/Euler.hpp @@ -200,7 +200,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, } localResidue /= tau * ( RealType ) size; - Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup ); + TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() ); //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl; } diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp index 1fd8f8a2b..b45faa1b4 100644 --- a/src/Benchmarks/ODESolvers/Merson.hpp +++ b/src/Benchmarks/ODESolvers/Merson.hpp @@ -185,13 +185,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u ) time += currentTau; computeNewTimeLevel( time, currentTau, u, newResidue ); this->setResidue( newResidue ); - + /**** * When time is close to stopTime the new residue * may be inaccurate significantly. */ if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - + if( !
this->nextIteration() ) return false; @@ -207,7 +207,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u ) currentTau = min( currentTau, this->getMaxTau() ); #ifdef USE_MPI TNLMPI::Bcast( currentTau, 1, 0 ); -#endif +#endif } if( time + currentTau > this->getStopTime() ) currentTau = this->getStopTime() - time; //we don't want to keep such tau @@ -403,7 +403,7 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con } #endif } - Problem::CommunicatorType::Allreduce( &eps, &maxEps, 1, MPI_MAX, Problem::CommunicatorType::AllGroup ); + TNL::MPI::Allreduce( &eps, &maxEps, 1, MPI_MAX, TNL::MPI::AllGroup() ); return maxEps; } @@ -465,7 +465,7 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time, } localResidue /= tau * ( RealType ) size; - Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup); + TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() ); /*#ifdef USE_MPI TNLMPI::Allreduce( localResidue, currentResidue, 1, MPI_SUM); #else diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h index fcaaaedf2..0d8d3c04e 100644 --- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h +++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h @@ -23,8 +23,8 @@ #include #include #include -#include #include +#include #include #include @@ -38,8 +38,6 @@ using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Pointers; -using CommunicatorType = Communicators::MpiCommunicator; - template< typename Real, typename Index > void @@ -113,7 +111,7 @@ struct ODESolversBenchmark Benchmark::MetadataMap metadata, const Config::ParameterContainer& parameters ) { - const String name = String( (CommunicatorType::isDistributed()) ? "Distributed ODE solvers" : "ODE solvers" ); + const String name = String( (TNL::MPI::GetSize() > 1) ?
"Distributed ODE solvers" : "ODE solvers" ); //+ " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) { @@ -122,7 +120,7 @@ struct ODESolversBenchmark { "DOFs", convertToString( dofs ) }, } )); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, dofs ); else runNonDistributed( benchmark, metadata, parameters, dofs ); @@ -136,7 +134,7 @@ struct ODESolversBenchmark const Config::ParameterContainer& parameters, size_t dofs ) { - //const auto group = CommunicatorType::AllGroup; + //const auto group = TNL::MPI::AllGroup(); std::cout << "Iterative solvers:" << std::endl; benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs ); @@ -204,7 +202,7 @@ configSetup( Config::ConfigDescription& config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); config.addDelimiter( "ODE solver settings:" ); Solvers::IterativeSolver< double, int >::configSetup( config ); @@ -226,13 +224,13 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); TNL::MPI::ScopedInitializer mpi(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + const int rank = TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! 
TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h index 742acd5ed..aa7c008a7 100644 --- a/src/TNL/Algorithms/DistributedScan.h +++ b/src/TNL/Algorithms/DistributedScan.h @@ -14,6 +14,7 @@ #include #include +#include namespace TNL { namespace Algorithms { @@ -32,10 +33,9 @@ struct DistributedScan { using RealType = typename DistributedVector::RealType; using DeviceType = typename DistributedVector::DeviceType; - using CommunicatorType = typename DistributedVector::CommunicatorType; const auto group = v.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // adjust begin and end for the local range const auto localRange = v.getLocalRange(); begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); @@ -47,18 +47,18 @@ struct DistributedScan const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); // exchange local sums between ranks - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); RealType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; Containers::Vector< RealType, Devices::Host > rankSums( nproc ); // NOTE: exchanging general data types does not work with MPI - CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); // compute the scan of the per-rank sums Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); // perform second phase: shift by the per-block and per-rank offsets - const int rank = CommunicatorType::GetRank( group ); + const int rank = MPI::GetRank( group ); Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, 
rankSums[ rank ] ); } } diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h index 33e96ca9a..3947bfec4 100644 --- a/src/TNL/Containers/DistributedArray.h +++ b/src/TNL/Containers/DistributedArray.h @@ -21,22 +21,21 @@ namespace Containers { template< typename Value, typename Device = Devices::Host, typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class DistributedArray { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using LocalArrayType = Containers::Array< Value, Device, Index >; + using LocalArrayType = Containers::Array< Value, Device, Index, Allocator >; public: using ValueType = Value; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; + using AllocatorType = Allocator; using LocalRangeType = Subrange< Index >; using LocalViewType = Containers::ArrayView< Value, Device, Index >; using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; - using ViewType = DistributedArrayView< Value, Device, Index, Communicator >; - using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >; + using ViewType = DistributedArrayView< Value, Device, Index >; + using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >; using SynchronizerType = typename ViewType::SynchronizerType; /** @@ -45,26 +44,50 @@ public: template< typename _Value, typename _Device = Device, typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedArray< _Value, _Device, _Index, _Communicator >; + typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Value > > + using Self = DistributedArray< _Value, _Device, _Index, _Allocator >; ~DistributedArray(); + /** + * \brief Constructs an empty array with zero size. 
+ */ DistributedArray() = default; - // Copy-constructor does deep copy. - DistributedArray( const DistributedArray& ); + /** + * \brief Constructs an empty array and sets the provided allocator. + * + * \param allocator The allocator to be associated with this array. + */ + explicit DistributedArray( const AllocatorType& allocator ); - DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + /** + * \brief Copy constructor (makes a deep copy). + * + * \param array The array to be copied. + */ + explicit DistributedArray( const DistributedArray& array ); - void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + /** + * \brief Copy constructor with a specific allocator (makes a deep copy). + * + * \param array The array to be copied. + * \param allocator The allocator to be associated with this array. + */ + explicit DistributedArray( const DistributedArray& array, const AllocatorType& allocator ); + + DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup(), const AllocatorType& allocator = AllocatorType() ); + + void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup() ); const LocalRangeType& getLocalRange() const; IndexType getGhosts() const; - CommunicationGroup getCommunicationGroup() const; + MPI_Comm getCommunicationGroup() const; + + AllocatorType getAllocator() const; /** * \brief Returns a modifiable view of the local part of the array. 
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp index 61dc3eda0..e9ee12093 100644 --- a/src/TNL/Containers/DistributedArray.hpp +++ b/src/TNL/Containers/DistributedArray.hpp @@ -22,8 +22,8 @@ namespace Containers { template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: ~DistributedArray() { // Wait for pending async operation, otherwise the synchronizer would crash @@ -34,20 +34,43 @@ DistributedArray< Value, Device, Index, Communicator >:: template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( const Allocator& allocator ) +: localData( allocator ) +{ +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: DistributedArray( const DistributedArray& array ) { setLike( array ); - localData = array.getConstLocalViewWithGhosts(); + view = array; } template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: -DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group ) + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( const DistributedArray& array, const Allocator& allocator ) +: localData( allocator ) +{ + setLike( array ); + view = array; +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, const Allocator& allocator ) 
+: localData( allocator ) { setDistribution( localRange, ghosts, globalSize, group ); } @@ -55,13 +78,13 @@ DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalS template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: -setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group ) +DistributedArray< Value, Device, Index, Allocator >:: +setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group ) { TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" ); - if( group != Communicator::NullGroup ) + if( group != MPI::NullGroup() ) localData.setSize( localRange.getSize() + ghosts ); view.bind( localRange, ghosts, globalSize, group, localData.getView() ); } @@ -69,9 +92,9 @@ setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSi template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > const Subrange< Index >& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getLocalRange() const { return view.getLocalRange(); @@ -80,9 +103,9 @@ getLocalRange() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > Index -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getGhosts() const { return view.getGhosts(); @@ -91,9 +114,9 @@ getGhosts() const template< typename Value, typename Device, typename Index, - typename Communicator > -typename Communicator::CommunicationGroup -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +MPI_Comm +DistributedArray< Value, Device, Index, Allocator >:: getCommunicationGroup() const { 
return view.getCommunicationGroup(); @@ -102,9 +125,20 @@ getCommunicationGroup() const template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +Allocator +DistributedArray< Value, Device, Index, Allocator >:: +getAllocator() const +{ + return localData.getAllocator(); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getLocalView() { return view.getLocalView(); @@ -113,9 +147,9 @@ getLocalView() template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getConstLocalView() const { return view.getConstLocalView(); @@ -124,9 +158,9 @@ getConstLocalView() const template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getLocalViewWithGhosts() { return view.getLocalViewWithGhosts(); @@ -135,9 +169,9 @@ getLocalViewWithGhosts() template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > 
+typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getConstLocalViewWithGhosts() const { return view.getConstLocalViewWithGhosts(); @@ -147,9 +181,9 @@ getConstLocalViewWithGhosts() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: copyFromGlobal( ConstLocalViewType globalArray ) { view.copyFromGlobal( globalArray ); @@ -158,9 +192,9 @@ copyFromGlobal( ConstLocalViewType globalArray ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) { view.setSynchronizer( synchronizer, valuesPerElement ); @@ -169,9 +203,9 @@ setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPer template< typename Value, typename Device, typename Index, - typename Communicator > -std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType > +DistributedArray< Value, Device, Index, Allocator >:: getSynchronizer() const { return view.getSynchronizer(); @@ -180,9 +214,9 @@ getSynchronizer() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > int -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getValuesPerElement() const { return view.getValuesPerElement(); @@ -191,9 +225,9 @@ getValuesPerElement() const template< 
typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: startSynchronization() { view.startSynchronization(); @@ -202,9 +236,9 @@ startSynchronization() template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: waitForSynchronization() const { view.waitForSynchronization(); @@ -218,9 +252,9 @@ waitForSynchronization() const template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ViewType +DistributedArray< Value, Device, Index, Allocator >:: getView() { return view; @@ -229,9 +263,9 @@ getView() template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ConstViewType +DistributedArray< Value, Device, Index, Allocator >:: getConstView() const { return view.getConstView(); @@ -240,8 +274,8 @@ getConstView() const template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: operator ViewType() { return getView(); @@ -250,8 +284,8 @@ operator ViewType() template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + 
typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: operator ConstViewType() const { return getConstView(); @@ -260,10 +294,10 @@ operator ConstViewType() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setLike( const Array& array ) { localData.setLike( array.getConstLocalViewWithGhosts() ); @@ -276,9 +310,9 @@ setLike( const Array& array ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: reset() { view.reset(); @@ -288,9 +322,9 @@ reset() template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: empty() const { return view.empty(); @@ -299,9 +333,9 @@ empty() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > Index -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getSize() const { return view.getSize(); @@ -310,9 +344,9 @@ getSize() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setValue( ValueType value ) { view.setValue( value ); @@ -321,9 +355,9 @@ setValue( ValueType value ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, 
Allocator >:: setElement( IndexType i, ValueType value ) { view.setElement( i, value ); @@ -332,9 +366,9 @@ setElement( IndexType i, ValueType value ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > Value -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getElement( IndexType i ) const { return view.getElement( i ); @@ -343,10 +377,10 @@ getElement( IndexType i ) const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > __cuda_callable__ Value& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator[]( IndexType i ) { return view[ i ]; @@ -355,10 +389,10 @@ operator[]( IndexType i ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > __cuda_callable__ const Value& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator[]( IndexType i ) const { return view[ i ]; @@ -367,9 +401,9 @@ operator[]( IndexType i ) const template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >& -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >& +DistributedArray< Value, Device, Index, Allocator >:: operator=( const DistributedArray& array ) { setLike( array ); @@ -380,10 +414,10 @@ operator=( const DistributedArray& array ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array, typename..., typename > -DistributedArray< Value, Device, Index, Communicator >& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >& +DistributedArray< 
Value, Device, Index, Allocator >:: operator=( const Array& array ) { setLike( array ); @@ -394,10 +428,10 @@ operator=( const Array& array ) template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator==( const Array& array ) const { return view == array; @@ -406,10 +440,10 @@ operator==( const Array& array ) const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator!=( const Array& array ) const { return view != array; @@ -418,9 +452,9 @@ operator!=( const Array& array ) const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: containsValue( ValueType value ) const { return view.containsValue( value ); @@ -429,9 +463,9 @@ containsValue( ValueType value ) const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: containsOnlyValue( ValueType value ) const { return view.containsOnlyValue( value ); diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index 0a9aef1a4..cb3235ddb 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -15,30 +15,27 @@ #include #include -#include #include #include +#include namespace TNL { namespace Containers { template< typename Value, typename Device = Devices::Host, - typename Index = int, - 
typename Communicator = Communicators::MpiCommunicator > + typename Index = int > class DistributedArrayView { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = Value; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; using LocalRangeType = Subrange< Index >; using LocalViewType = Containers::ArrayView< Value, Device, Index >; using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; - using ViewType = DistributedArrayView< Value, Device, Index, Communicator >; - using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >; + using ViewType = DistributedArrayView< Value, Device, Index >; + using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >; using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >; /** @@ -46,15 +43,14 @@ public: */ template< typename _Value, typename _Device = Device, - typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >; + typename _Index = Index > + using Self = DistributedArrayView< _Value, _Device, _Index >; ~DistributedArrayView(); // Initialization by raw data - DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) + DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ) : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData) { TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, @@ -69,13 +65,13 @@ public: // "Templated copy-constructor" accepting any cv-qualification of Value template< typename Value_ > - DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& ); + 
DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& ); // default move-constructor DistributedArrayView( DistributedArrayView&& ) = default; // method for rebinding (reinitialization) to raw data - void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ); + void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ); // Note that you can also bind directly to DistributedArray and other types implicitly // convertible to DistributedArrayView. @@ -90,7 +86,7 @@ public: IndexType getGhosts() const; - CommunicationGroup getCommunicationGroup() const; + MPI_Comm getCommunicationGroup() const; LocalViewType getLocalView(); @@ -184,7 +180,7 @@ protected: LocalRangeType localRange; IndexType ghosts = 0; IndexType globalSize = 0; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); LocalViewType localData; std::shared_ptr< SynchronizerType > synchronizer = nullptr; diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 65654a54d..65ecc4101 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -19,9 +19,8 @@ namespace Containers { template< typename Value, typename Device, - typename Index, - typename Communicator > -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +DistributedArrayView< Value, Device, Index >:: ~DistributedArrayView() { // Wait for pending async operation, otherwise the synchronizer might crash @@ -33,11 +32,10 @@ DistributedArrayView< Value, Device, Index, Communicator >:: template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Value_ > -DistributedArrayView< Value, Device, Index, Communicator >:: -DistributedArrayView( const 
DistributedArrayView< Value_, Device, Index, Communicator >& view ) +DistributedArrayView< Value, Device, Index >:: +DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& view ) : localRange( view.getLocalRange() ), ghosts( view.getGhosts() ), globalSize( view.getSize() ), @@ -49,11 +47,10 @@ DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communi template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: -bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) +DistributedArrayView< Value, Device, Index >:: +bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ) { TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." 
); @@ -68,10 +65,9 @@ bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: bind( DistributedArrayView view ) { localRange = view.getLocalRange(); @@ -86,11 +82,10 @@ bind( DistributedArrayView view ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Value_ > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: bind( Value_* data, IndexType localSize ) { TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts, @@ -100,10 +95,9 @@ bind( Value_* data, IndexType localSize ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > const Subrange< Index >& -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getLocalRange() const { return localRange; @@ -111,10 +105,9 @@ getLocalRange() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > Index -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getGhosts() const { return ghosts; @@ -122,10 +115,9 @@ getGhosts() const template< typename Value, typename Device, - typename Index, - typename Communicator > -typename Communicator::CommunicationGroup -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +MPI_Comm +DistributedArrayView< Value, Device, Index >:: getCommunicationGroup() const { return group; @@ -133,10 +125,9 @@ getCommunicationGroup() const template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, 
Communicator >::LocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::LocalViewType +DistributedArrayView< Value, Device, Index >:: getLocalView() { return LocalViewType( localData.getData(), localRange.getSize() ); @@ -144,10 +135,9 @@ getLocalView() template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType +DistributedArrayView< Value, Device, Index >:: getConstLocalView() const { return ConstLocalViewType( localData.getData(), localRange.getSize() ); @@ -155,10 +145,9 @@ getConstLocalView() const template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::LocalViewType +DistributedArrayView< Value, Device, Index >:: getLocalViewWithGhosts() { return localData; @@ -166,10 +155,9 @@ getLocalViewWithGhosts() template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType +DistributedArrayView< Value, Device, Index >:: getConstLocalViewWithGhosts() const { return localData; @@ -177,10 +165,9 @@ getConstLocalViewWithGhosts() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator 
>:: +DistributedArrayView< Value, Device, Index >:: copyFromGlobal( ConstLocalViewType globalArray ) { TNL_ASSERT_EQ( getSize(), globalArray.getSize(), @@ -200,10 +187,9 @@ copyFromGlobal( ConstLocalViewType globalArray ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) { this->synchronizer = synchronizer; @@ -212,10 +198,9 @@ setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPer template< typename Value, typename Device, - typename Index, - typename Communicator > -std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType > -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType > +DistributedArrayView< Value, Device, Index >:: getSynchronizer() const { return synchronizer; @@ -223,10 +208,9 @@ getSynchronizer() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > int -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getValuesPerElement() const { return valuesPerElement; @@ -234,10 +218,9 @@ getValuesPerElement() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: startSynchronization() { if( ghosts == 0 ) @@ -255,10 +238,9 @@ startSynchronization() template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: 
+DistributedArrayView< Value, Device, Index >:: waitForSynchronization() const { if( synchronizer && synchronizer->async_op.valid() ) { @@ -271,10 +253,9 @@ waitForSynchronization() const template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::ViewType +DistributedArrayView< Value, Device, Index >:: getView() { return *this; @@ -282,10 +263,9 @@ getView() template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType -DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstViewType +DistributedArrayView< Value, Device, Index >:: getConstView() const { return *this; @@ -293,25 +273,23 @@ getConstView() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: reset() { localRange.reset(); ghosts = 0; globalSize = 0; - group = Communicator::NullGroup; + group = MPI::NullGroup(); localData.reset(); } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: empty() const { return getSize() == 0; @@ -321,10 +299,9 @@ empty() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > Index -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getSize() const { return globalSize; @@ -332,10 +309,9 @@ 
getSize() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: setValue( ValueType value ) { localData.setValue( value ); @@ -344,10 +320,9 @@ setValue( ValueType value ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: setElement( IndexType i, ValueType value ) { const IndexType li = localRange.getLocalIndex( i ); @@ -356,10 +331,9 @@ setElement( IndexType i, ValueType value ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > Value -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getElement( IndexType i ) const { const IndexType li = localRange.getLocalIndex( i ); @@ -368,11 +342,10 @@ getElement( IndexType i ) const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > __cuda_callable__ Value& -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator[]( IndexType i ) { const IndexType li = localRange.getLocalIndex( i ); @@ -381,11 +354,10 @@ operator[]( IndexType i ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > __cuda_callable__ const Value& -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator[]( IndexType i ) const { const IndexType li = localRange.getLocalIndex( i ); @@ -394,10 +366,9 @@ operator[]( IndexType i ) const template< typename Value, typename Device, - typename Index, - typename Communicator > -DistributedArrayView< Value, Device, Index, Communicator >& 
-DistributedArrayView< Value, Device, Index, Communicator >:: + typename Index > +DistributedArrayView< Value, Device, Index >& +DistributedArrayView< Value, Device, Index >:: operator=( const DistributedArrayView& view ) { TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." ); @@ -413,11 +384,10 @@ operator=( const DistributedArrayView& view ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Array, typename..., typename > -DistributedArrayView< Value, Device, Index, Communicator >& -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >& +DistributedArrayView< Value, Device, Index >:: operator=( const Array& array ) { TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); @@ -433,11 +403,10 @@ operator=( const Array& array ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Array > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator==( const Array& array ) const { // we can't run allreduce if the communication groups are different @@ -450,18 +419,17 @@ operator==( const Array& array ) const // compare without ghosts getConstLocalView() == array.getConstLocalView(); bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Array > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator!=( const Array& 
array ) const { return ! (*this == array); @@ -469,32 +437,30 @@ operator!=( const Array& array ) const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: containsValue( ValueType value ) const { bool result = false; - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { const bool localResult = localData.containsValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group ); + MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group ); } return result; } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: containsOnlyValue( ValueType value ) const { bool result = true; - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { const bool localResult = localData.containsOnlyValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); } return result; } diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 32dc80125..8d737e3a9 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -21,21 +21,20 @@ namespace Containers { template< typename Real, typename Device = Devices::Host, typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Allocator = typename Allocators::Default< Device >::template Allocator< Real > > class DistributedVector -: public DistributedArray< Real, Device, Index, Communicator > +: public DistributedArray< Real, Device, Index, Allocator > { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using 
BaseType = DistributedArray< Real, Device, Index, Communicator >; + using BaseType = DistributedArray< Real, Device, Index, Allocator >; public: using RealType = Real; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; + using AllocatorType = Allocator; using LocalViewType = Containers::VectorView< Real, Device, Index >; using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >; - using ViewType = DistributedVectorView< Real, Device, Index, Communicator >; - using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >; + using ViewType = DistributedVectorView< Real, Device, Index >; + using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >; /** * \brief A template which allows to quickly obtain a \ref Vector type with changed template parameters. @@ -43,8 +42,8 @@ public: template< typename _Real, typename _Device = Device, typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedVector< _Real, _Device, _Index, _Communicator >; + typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Real > > + using Self = DistributedVector< _Real, _Device, _Index, _Allocator >; // inherit all constructors and assignment operators from Array @@ -60,6 +59,11 @@ public: */ explicit DistributedVector( const DistributedVector& ) = default; + /** + * \brief Copy constructor with a specific allocator (makes a deep copy). + */ + explicit DistributedVector( const DistributedVector& vector, const AllocatorType& allocator ); + /** * \brief Default move constructor. 
*/ @@ -177,8 +181,8 @@ public: // Enable expression templates for DistributedVector namespace Expressions { - template< typename Real, typename Device, typename Index, typename Communicator > - struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Communicator > > + template< typename Real, typename Device, typename Index, typename Allocator > + struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Allocator > > : std::true_type {}; } // namespace Expressions diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index cbbc763ec..044b747d9 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -21,9 +21,19 @@ namespace Containers { template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Real, Device, Index, Allocator >:: +DistributedVector( const DistributedVector& vector, const AllocatorType& allocator ) +: BaseType::DistributedArray( vector, allocator ) +{ +} + +template< typename Real, + typename Device, + typename Index, + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType +DistributedVector< Real, Device, Index, Allocator >:: getLocalView() { return BaseType::getLocalView(); @@ -32,9 +42,9 @@ getLocalView() template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType +DistributedVector< Real, Device, Index, Allocator >:: getConstLocalView() const { return BaseType::getConstLocalView(); 
@@ -43,9 +53,9 @@ getConstLocalView() const template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType +DistributedVector< Real, Device, Index, Allocator >:: getLocalViewWithGhosts() { return BaseType::getLocalViewWithGhosts(); @@ -54,9 +64,9 @@ getLocalViewWithGhosts() template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType +DistributedVector< Real, Device, Index, Allocator >:: getConstLocalViewWithGhosts() const { return BaseType::getConstLocalViewWithGhosts(); @@ -65,9 +75,9 @@ getConstLocalViewWithGhosts() const template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Value, Device, Index, Communicator >::ViewType -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Value, Device, Index, Allocator >::ViewType +DistributedVector< Value, Device, Index, Allocator >:: getView() { return BaseType::getView(); @@ -76,9 +86,9 @@ getView() template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Value, Device, Index, Allocator >::ConstViewType +DistributedVector< Value, Device, Index, Allocator >:: getConstView() const { return BaseType::getConstView(); @@ -87,8 +97,8 @@ getConstView() const template< typename 
Value, typename Device, typename Index, - typename Communicator > -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Value, Device, Index, Allocator >:: operator ViewType() { return getView(); @@ -97,8 +107,8 @@ operator ViewType() template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Value, Device, Index, Allocator >:: operator ConstViewType() const { return getConstView(); @@ -112,10 +122,10 @@ operator ConstViewType() const template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator=( const Vector& vector ) { this->setLike( vector ); @@ -126,10 +136,10 @@ operator=( const Vector& vector ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator+=( const Vector& vector ) { getView() += vector; @@ -139,10 +149,10 @@ operator+=( const Vector& vector ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: 
operator-=( const Vector& vector ) { getView() -= vector; @@ -152,10 +162,10 @@ operator-=( const Vector& vector ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator*=( const Vector& vector ) { getView() *= vector; @@ -165,10 +175,10 @@ operator*=( const Vector& vector ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator/=( const Vector& vector ) { getView() /= vector; @@ -178,10 +188,10 @@ operator/=( const Vector& vector ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator=( Scalar c ) { getView() = c; @@ -191,10 +201,10 @@ operator=( Scalar c ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator+=( Scalar c ) { getView() += c; @@ 
-204,10 +214,10 @@ operator+=( Scalar c ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator-=( Scalar c ) { getView() -= c; @@ -217,10 +227,10 @@ operator-=( Scalar c ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator*=( Scalar c ) { getView() *= c; @@ -230,10 +240,10 @@ operator*=( Scalar c ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator/=( Scalar c ) { getView() /= c; @@ -243,10 +253,10 @@ operator/=( Scalar c ) template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< Algorithms::ScanType Type > void -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >:: scan( IndexType begin, IndexType end ) { getView().template scan< Type >( begin, end ); diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 6be52d9db..4a46a47ce 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ 
b/src/TNL/Containers/DistributedVectorView.h @@ -21,32 +21,28 @@ namespace Containers { template< typename Real, typename Device = Devices::Host, - typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Index = int > class DistributedVectorView -: public DistributedArrayView< Real, Device, Index, Communicator > +: public DistributedArrayView< Real, Device, Index > { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using BaseType = DistributedArrayView< Real, Device, Index, Communicator >; + using BaseType = DistributedArrayView< Real, Device, Index >; using NonConstReal = typename std::remove_const< Real >::type; public: using RealType = Real; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; using LocalViewType = Containers::VectorView< Real, Device, Index >; using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >; - using ViewType = DistributedVectorView< Real, Device, Index, Communicator >; - using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >; + using ViewType = DistributedVectorView< Real, Device, Index >; + using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >; /** * \brief A template which allows to quickly obtain a \ref VectorView type with changed template parameters. 
*/ template< typename _Real, typename _Device = Device, - typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedVectorView< _Real, _Device, _Index, _Communicator >; + typename _Index = Index > + using Self = DistributedVectorView< _Real, _Device, _Index >; // inherit all constructors and assignment operators from ArrayView @@ -62,7 +58,7 @@ public: // initialization by base class is not a copy constructor so it has to be explicit template< typename Real_ > // template catches both const and non-const qualified Element - DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view ) + DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index >& view ) : BaseType( view ) {} /** @@ -156,8 +152,8 @@ public: // Enable expression templates for DistributedVector namespace Expressions { - template< typename Real, typename Device, typename Index, typename Communicator > - struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index, Communicator > > + template< typename Real, typename Device, typename Index > + struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index > > : std::true_type {}; } // namespace Expressions diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index f1a6fb1e5..2f9222f94 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -20,10 +20,9 @@ namespace Containers { template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::LocalViewType +DistributedVectorView< Real, Device, Index >:: getLocalView() { return 
BaseType::getLocalView(); @@ -31,10 +30,9 @@ getLocalView() template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType +DistributedVectorView< Real, Device, Index >:: getConstLocalView() const { return BaseType::getConstLocalView(); @@ -42,10 +40,9 @@ getConstLocalView() const template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::LocalViewType +DistributedVectorView< Real, Device, Index >:: getLocalViewWithGhosts() { return BaseType::getLocalViewWithGhosts(); @@ -53,10 +50,9 @@ getLocalViewWithGhosts() template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType +DistributedVectorView< Real, Device, Index >:: getConstLocalViewWithGhosts() const { return BaseType::getConstLocalViewWithGhosts(); @@ -64,10 +60,9 @@ getConstLocalViewWithGhosts() const template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType -DistributedVectorView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Value, Device, Index >::ViewType +DistributedVectorView< Value, Device, Index >:: getView() { return *this; @@ -75,10 +70,9 @@ getView() template< typename Value, 
typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType -DistributedVectorView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Value, Device, Index >::ConstViewType +DistributedVectorView< Value, Device, Index >:: getConstView() const { return *this; @@ -91,11 +85,10 @@ getConstView() const template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), @@ -107,7 +100,7 @@ operator=( const Vector& vector ) TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "The communication groups of the array views must be equal." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { // TODO: it might be better to split the local and ghost parts and synchronize in the middle this->waitForSynchronization(); vector.waitForSynchronization(); @@ -118,11 +111,10 @@ operator=( const Vector& vector ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator+=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), @@ -134,7 +126,7 @@ operator+=( const Vector& vector ) TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { // TODO: it might be better to split the local and ghost parts and synchronize in the middle this->waitForSynchronization(); vector.waitForSynchronization(); @@ -145,11 +137,10 @@ operator+=( const Vector& vector ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator-=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), @@ -161,7 +152,7 @@ operator-=( const Vector& vector ) TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { // TODO: it might be better to split the local and ghost parts and synchronize in the middle this->waitForSynchronization(); vector.waitForSynchronization(); @@ -172,11 +163,10 @@ operator-=( const Vector& vector ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator*=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), @@ -188,7 +178,7 @@ operator*=( const Vector& vector ) TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { // TODO: it might be better to split the local and ghost parts and synchronize in the middle this->waitForSynchronization(); vector.waitForSynchronization(); @@ -199,11 +189,10 @@ operator*=( const Vector& vector ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator/=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), @@ -215,7 +204,7 @@ operator/=( const Vector& vector ) TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { // TODO: it might be better to split the local and ghost parts and synchronize in the middle this->waitForSynchronization(); vector.waitForSynchronization(); @@ -226,14 +215,13 @@ operator/=( const Vector& vector ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() = c; this->startSynchronization(); } @@ -242,14 +230,13 @@ operator=( Scalar c ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator+=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() += c; this->startSynchronization(); } @@ -258,14 +245,13 @@ operator+=( Scalar c ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator-=( Scalar c ) 
{ - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() -= c; this->startSynchronization(); } @@ -274,14 +260,13 @@ operator-=( Scalar c ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator*=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() *= c; this->startSynchronization(); } @@ -290,14 +275,13 @@ operator*=( Scalar c ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator/=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() /= c; this->startSynchronization(); } @@ -306,11 +290,10 @@ operator/=( Scalar c ) template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< Algorithms::ScanType Type > void -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >:: scan( IndexType begin, IndexType end ) { if( end == 0 ) diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h index 2695ccccc..10bf2d117 100644 --- 
a/src/TNL/Containers/Expressions/DistributedComparison.h +++ b/src/TNL/Containers/Expressions/DistributedComparison.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Containers { @@ -43,8 +43,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression // compare without ghosts a.getConstLocalView() == b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -64,8 +64,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getConstLocalView() < b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -80,8 +80,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getConstLocalView() <= b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -96,8 +96,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getConstLocalView() > b.getConstLocalView(); bool result = true; - 
if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -112,8 +112,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getConstLocalView() >= b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } }; @@ -128,8 +128,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a == b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -142,8 +142,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a < b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -151,8 +151,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a <= 
b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -160,8 +160,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a > b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -169,8 +169,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a >= b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } }; @@ -185,8 +185,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() == b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -199,8 +199,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = 
a.getConstLocalView() < b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -208,8 +208,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() <= b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -217,8 +217,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() > b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -226,8 +226,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() >= b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h 
b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 25175a467..5f67084fd 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -59,8 +59,6 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV using RealType = decltype( Operation::evaluate( std::declval()[0], std::declval()[0] ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, typename T2::ConstLocalViewType, @@ -115,7 +113,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getGhosts(); } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return op1.getCommunicationGroup(); } @@ -159,8 +157,6 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV using RealType = decltype( Operation::evaluate( std::declval()[0], std::declval() ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >; using SynchronizerType = typename T1::SynchronizerType; @@ -199,7 +195,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getGhosts(); } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return op1.getCommunicationGroup(); } @@ -242,8 +238,6 @@ struct 
DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl using RealType = decltype( Operation::evaluate( std::declval(), std::declval()[0] ) ); using DeviceType = typename T2::DeviceType; using IndexType = typename T2::IndexType; - using CommunicatorType = typename T2::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T2::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >; using SynchronizerType = typename T2::SynchronizerType; @@ -282,7 +276,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl return op2.getGhosts(); } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return op2.getCommunicationGroup(); } @@ -326,8 +320,6 @@ struct DistributedUnaryExpressionTemplate using RealType = decltype( Operation::evaluate( std::declval()[0] ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >; using SynchronizerType = typename T1::SynchronizerType; @@ -366,7 +358,7 @@ struct DistributedUnaryExpressionTemplate return operand.getGhosts(); } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return operand.getCommunicationGroup(); } diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index f55ae3d4a..903df1e1d 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -11,7 +11,7 @@ #pragma once #include 
-#include +#include namespace TNL { namespace Containers { @@ -21,14 +21,13 @@ template< typename Expression > auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionMin( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() ); } return result; } @@ -40,26 +39,25 @@ auto DistributedExpressionArgMin( const Expression& expression ) using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; using ResultType = std::pair< RealType, IndexType >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::max() ); const auto group = expression.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // compute local argMin ResultType localResult = ExpressionArgMin( expression.getConstLocalView() ); // transform local index to global index localResult.second += expression.getLocalRange().getBegin(); // scatter local result to all processes and gather their results - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = 
MPI::GetSize( group ); ResultType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult; ResultType gatheredResults[ nproc ]; // NOTE: exchanging general data types does not work with MPI - //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); - CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); + //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); + MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); // reduce the gathered data const auto* _data = gatheredResults; // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!) @@ -82,14 +80,13 @@ template< typename Expression > auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::lowest(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionMax( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() ); } return result; } @@ -101,26 +98,25 @@ auto DistributedExpressionArgMax( const Expression& expression ) using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; using ResultType = std::pair< RealType, IndexType >; - using CommunicatorType = 
typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::lowest() ); const auto group = expression.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // compute local argMax ResultType localResult = ExpressionArgMax( expression.getConstLocalView() ); // transform local index to global index localResult.second += expression.getLocalRange().getBegin(); // scatter local result to all processes and gather their results - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); ResultType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult; ResultType gatheredResults[ nproc ]; // NOTE: exchanging general data types does not work with MPI - //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); - CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); + //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); + MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); // reduce the gathered data const auto* _data = gatheredResults; // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!) 
@@ -143,12 +139,11 @@ template< typename Expression > auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionSum( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() ); } return result; } @@ -157,12 +152,11 @@ template< typename Expression > auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_t< decltype( expression[0] * expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 1; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionProduct( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() ); } return result; } @@ -171,14 +165,13 @@ template< typename Expression > auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] && expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); 
ResultType result = std::numeric_limits< ResultType >::max(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() ); } return result; } @@ -187,12 +180,11 @@ template< typename Expression > auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] || expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] || expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionLogicalOr( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() ); } return result; } @@ -201,14 +193,13 @@ template< typename Expression > auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType 
localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() ); } return result; } @@ -217,12 +208,11 @@ template< typename Expression > auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] | expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() ); } return result; } diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h index c2dce9e34..6d3605b5a 100644 --- a/src/TNL/Containers/Partitioner.h +++ b/src/TNL/Containers/Partitioner.h @@ -22,18 +22,17 @@ namespace TNL { namespace Containers { -template< typename Index, typename Communicator = Communicators::MpiCommunicator > +template< typename Index > class Partitioner { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using SubrangeType = Subrange< Index >; - static SubrangeType splitRange( Index globalSize, CommunicationGroup group ) + static SubrangeType splitRange( Index globalSize, MPI_Comm group ) { - if( group != Communicator::NullGroup ) { - const int rank = Communicator::GetRank( group ); - const int partitions = Communicator::GetSize( group ); + if( group != MPI::NullGroup() ) { + const int rank = MPI::GetRank( group ); + const int partitions = 
MPI::GetSize( group ); const Index begin = TNL::min( globalSize, rank * globalSize / partitions ); const Index end = TNL::min( globalSize, (rank + 1) * globalSize / partitions ); return SubrangeType( begin, end ); @@ -78,7 +77,7 @@ public: SubrangeType localRange; int overlaps; - CommunicationGroup group; + MPI_Comm group; public: using ByteArrayView = typename Base::ByteArrayView; @@ -93,14 +92,14 @@ public: ArraySynchronizer() = delete; - ArraySynchronizer( SubrangeType localRange, int overlaps, CommunicationGroup group ) + ArraySynchronizer( SubrangeType localRange, int overlaps, MPI_Comm group ) : localRange(localRange), overlaps(overlaps), group(group) {} virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override { auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); - Communicator::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); } virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override @@ -108,30 +107,30 @@ public: TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps), "unexpected array size" ); - const int rank = Communicator::GetRank( group ); - const int nproc = Communicator::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); const int left = (rank > 0) ? rank - 1 : nproc - 1; const int right = (rank < nproc - 1) ? 
rank + 1 : 0; // buffer for asynchronous communication requests - std::vector< typename Communicator::Request > requests; + std::vector< MPI_Request > requests; // issue all async receive operations - requests.push_back( Communicator::IRecv( + requests.push_back( MPI::Irecv( array.getData() + bytesPerValue * localRange.getSize(), bytesPerValue * overlaps, left, 0, group ) ); - requests.push_back( Communicator::IRecv( + requests.push_back( MPI::Irecv( array.getData() + bytesPerValue * (localRange.getSize() + overlaps), bytesPerValue * overlaps, right, 0, group ) ); // issue all async send operations - requests.push_back( Communicator::ISend( + requests.push_back( MPI::Isend( array.getData(), bytesPerValue * overlaps, left, 0, group ) ); - requests.push_back( Communicator::ISend( + requests.push_back( MPI::Isend( array.getData() + bytesPerValue * (localRange.getSize() - overlaps), bytesPerValue * overlaps, right, 0, group ) ); diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h index 5731d11ca..61e4eabb6 100644 --- a/src/TNL/Matrices/DistributedMatrix.h +++ b/src/TNL/Matrices/DistributedMatrix.h @@ -14,7 +14,6 @@ #include -#include #include #include #include @@ -23,58 +22,39 @@ namespace TNL { namespace Matrices { -template< typename T, typename R = void > -struct enable_if_type -{ - using type = R; -}; - -template< typename T, typename Enable = void > -struct has_communicator : std::false_type {}; - -template< typename T > -struct has_communicator< T, typename enable_if_type< typename T::CommunicatorType >::type > -: std::true_type -{}; - - // TODO: 2D distribution for dense matrices (maybe it should be in different template, // because e.g. 
setRowFast doesn't make sense for dense matrices) -template< typename Matrix, - typename Communicator = Communicators::MpiCommunicator > +template< typename Matrix > class DistributedMatrix { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using MatrixType = Matrix; using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; - using CommunicatorType = Communicator; using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >; - using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >; + using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType >; using MatrixRow = typename Matrix::RowView; using ConstMatrixRow = typename Matrix::ConstRowView; template< typename _Real = RealType, typename _Device = DeviceType, - typename _Index = IndexType, - typename _Communicator = Communicator > - using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index >, _Communicator >; + typename _Index = IndexType > + using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index > >; DistributedMatrix() = default; DistributedMatrix( DistributedMatrix& ) = default; - DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup ); + DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() ); - void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup ); + void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() ); const LocalRangeType& getLocalRowRange() const; - CommunicationGroup getCommunicationGroup() const; + MPI_Comm 
getCommunicationGroup() const; const Matrix& getLocalMatrix() const; @@ -124,7 +104,7 @@ public: // multiplication with a global vector template< typename InVector, typename OutVector > - typename std::enable_if< ! has_communicator< InVector >::value >::type + typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type vectorProduct( const InVector& inVector, OutVector& outVector ) const; @@ -135,7 +115,7 @@ public: // (not const because it modifies internal bufers) template< typename InVector, typename OutVector > - typename std::enable_if< has_communicator< InVector >::value >::type + typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type vectorProduct( const InVector& inVector, OutVector& outVector ) const; @@ -149,10 +129,10 @@ public: protected: LocalRangeType localRowRange; IndexType rows = 0; // global rows count - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); Matrix localMatrix; - DistributedSpMV< Matrix, Communicator > spmv; + DistributedSpMV< Matrix > spmv; }; } // namespace Matrices diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h index b9638e002..8bc5d0982 100644 --- a/src/TNL/Matrices/DistributedMatrix_impl.h +++ b/src/TNL/Matrices/DistributedMatrix_impl.h @@ -17,60 +17,54 @@ namespace TNL { namespace Matrices { -template< typename Matrix, - typename Communicator > -DistributedMatrix< Matrix, Communicator >:: -DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group ) +template< typename Matrix > +DistributedMatrix< Matrix >:: +DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group ) { setDistribution( localRowRange, rows, columns, group ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: -setDistribution( LocalRangeType 
localRowRange, IndexType rows, IndexType columns, CommunicationGroup group ) +DistributedMatrix< Matrix >:: +setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group ) { this->localRowRange = localRowRange; this->rows = rows; this->group = group; - if( group != Communicator::NullGroup ) + if( group != MPI::NullGroup() ) localMatrix.setDimensions( localRowRange.getSize(), columns ); spmv.reset(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > const Containers::Subrange< typename Matrix::IndexType >& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getLocalRowRange() const { return localRowRange; } -template< typename Matrix, - typename Communicator > -typename Communicator::CommunicationGroup -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +MPI_Comm +DistributedMatrix< Matrix >:: getCommunicationGroup() const { return group; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > const Matrix& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getLocalMatrix() const { return localMatrix; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > Matrix& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getLocalMatrix() { return localMatrix; @@ -81,10 +75,9 @@ getLocalMatrix() * Some common Matrix methods follow below. 
*/ -template< typename Matrix, - typename Communicator > -DistributedMatrix< Matrix, Communicator >& -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +DistributedMatrix< Matrix >& +DistributedMatrix< Matrix >:: operator=( const DistributedMatrix& matrix ) { setLike( matrix ); @@ -92,11 +85,10 @@ operator=( const DistributedMatrix& matrix ) return *this; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename MatrixT > -DistributedMatrix< Matrix, Communicator >& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >& +DistributedMatrix< Matrix >:: operator=( const MatrixT& matrix ) { setLike( matrix ); @@ -104,11 +96,10 @@ operator=( const MatrixT& matrix ) return *this; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename MatrixT > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setLike( const MatrixT& matrix ) { localRowRange = matrix.getLocalRowRange(); @@ -119,84 +110,77 @@ setLike( const MatrixT& matrix ) spmv.reset(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: reset() { localRowRange.reset(); rows = 0; - group = Communicator::NullGroup; + group = MPI::NullGroup(); localMatrix.reset(); spmv.reset(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getRows() const { return rows; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getColumns() const { return localMatrix.getColumns(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename 
RowCapacitiesVector > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setRowCapacities( const RowCapacitiesVector& rowCapacities ) { TNL_ASSERT_EQ( rowCapacities.getSize(), getRows(), "row lengths vector has wrong size" ); TNL_ASSERT_EQ( rowCapacities.getLocalRange(), getLocalRowRange(), "row lengths vector has wrong distribution" ); TNL_ASSERT_EQ( rowCapacities.getCommunicationGroup(), getCommunicationGroup(), "row lengths vector has wrong communication group" ); - if( getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( getCommunicationGroup() != MPI::NullGroup() ) { localMatrix.setRowCapacities( rowCapacities.getConstLocalView() ); spmv.reset(); } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename Vector > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getCompressedRowLengths( Vector& rowLengths ) const { - if( getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( getCommunicationGroup() != MPI::NullGroup() ) { rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() ); auto localRowLengths = rowLengths.getLocalView(); localMatrix.getCompressedRowLengths( localRowLengths ); } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getRowCapacity( IndexType row ) const { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRowCapacity( localRow ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setElement( IndexType row, IndexType column, RealType value ) @@ -205,10 +189,9 @@ setElement( IndexType row, localMatrix.setElement( localRow, column, value ); } -template< typename Matrix, - typename Communicator > +template< 
typename Matrix > typename Matrix::RealType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getElement( IndexType row, IndexType column ) const { @@ -216,10 +199,9 @@ getElement( IndexType row, return localMatrix.getElement( localRow, column ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::RealType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getElementFast( IndexType row, IndexType column ) const { @@ -227,32 +209,29 @@ getElementFast( IndexType row, return localMatrix.getElementFast( localRow, column ); } -template< typename Matrix, - typename Communicator > -typename DistributedMatrix< Matrix, Communicator >::MatrixRow -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +typename DistributedMatrix< Matrix >::MatrixRow +DistributedMatrix< Matrix >:: getRow( IndexType row ) { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRow( localRow ); } -template< typename Matrix, - typename Communicator > -typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +typename DistributedMatrix< Matrix >::ConstMatrixRow +DistributedMatrix< Matrix >:: getRow( IndexType row ) const { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRow( localRow ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename InVector, typename OutVector > -typename std::enable_if< ! has_communicator< InVector >::value >::type -DistributedMatrix< Matrix, Communicator >:: +typename std::enable_if< ! 
HasGetCommunicationGroupMethod< InVector >::value >::type +DistributedMatrix< Matrix >:: vectorProduct( const InVector& inVector, OutVector& outVector ) const { @@ -265,23 +244,21 @@ vectorProduct( const InVector& inVector, localMatrix.vectorProduct( inVector, outView ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: updateVectorProductCommunicationPattern() { - if( getCommunicationGroup() == CommunicatorType::NullGroup ) + if( getCommunicationGroup() == MPI::NullGroup() ) return; spmv.updateCommunicationPattern( getLocalMatrix(), getCommunicationGroup() ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename InVector, typename OutVector > -typename std::enable_if< has_communicator< InVector >::value >::type -DistributedMatrix< Matrix, Communicator >:: +typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type +DistributedMatrix< Matrix >:: vectorProduct( const InVector& inVector, OutVector& outVector ) const { @@ -291,7 +268,7 @@ vectorProduct( const InVector& inVector, TNL_ASSERT_EQ( outVector.getLocalRange(), getLocalRowRange(), "output vector has wrong distribution" ); TNL_ASSERT_EQ( outVector.getCommunicationGroup(), getCommunicationGroup(), "output vector has wrong communication group" ); - if( getCommunicationGroup() == CommunicatorType::NullGroup ) + if( getCommunicationGroup() == MPI::NullGroup() ) return; if( inVector.getGhosts() == 0 ) { @@ -314,11 +291,10 @@ vectorProduct( const InVector& inVector, } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename Vector1, typename Vector2 > bool -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: performSORIteration( const Vector1& b, const IndexType row, Vector2& x, diff --git a/src/TNL/Matrices/DistributedSpMV.h 
b/src/TNL/Matrices/DistributedSpMV.h index 76aaa77fe..bea864ead 100644 --- a/src/TNL/Matrices/DistributedSpMV.h +++ b/src/TNL/Matrices/DistributedSpMV.h @@ -33,7 +33,7 @@ namespace TNL { namespace Matrices { -template< typename Matrix, typename Communicator > +template< typename Matrix > class DistributedSpMV { public: @@ -41,8 +41,6 @@ public: using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; - using CommunicatorType = Communicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >; // - communication pattern: vector components whose indices are in the range @@ -55,10 +53,10 @@ public: // - assembly of the i-th row involves traversal of the local matrix stored // in the i-th process // - assembly of the full matrix needs all-to-all communication - void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, CommunicationGroup group ) + void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, MPI_Comm group ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); commPatternStarts.setDimensions( nproc, nproc ); commPatternEnds.setDimensions( nproc, nproc ); @@ -67,9 +65,9 @@ public: { Containers::Array< IndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( localRowRange.getBegin() ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + group ); } const auto globalOffsetsView = globalOffsets.getConstView(); auto getOwner = [=] __cuda_callable__ ( IndexType global_idx ) -> int @@ -150,12 +148,12 @@ public: } // assemble the commPattern* 
matrices - CommunicatorType::Alltoall( &preCommPatternStarts(0, 0), nproc, - &commPatternStarts(0, 0), nproc, - group ); - CommunicatorType::Alltoall( &preCommPatternEnds(0, 0), nproc, - &commPatternEnds(0, 0), nproc, - group ); + MPI::Alltoall( &preCommPatternStarts(0, 0), nproc, + &commPatternStarts(0, 0), nproc, + group ); + MPI::Alltoall( &preCommPatternEnds(0, 0), nproc, + &commPatternEnds(0, 0), nproc, + group ); } template< typename InVector, @@ -164,10 +162,10 @@ public: const MatrixType& localMatrix, const LocalRangeType& localRowRange, const InVector& inVector, - CommunicationGroup group ) + MPI_Comm group ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // handle trivial case if( nproc == 1 ) { @@ -190,14 +188,14 @@ public: TNL_ASSERT_EQ( globalBuffer.getSize(), localMatrix.getColumns(), "the global buffer size does not match the number of matrix columns" ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > commRequests; + std::vector< MPI_Request > commRequests; // send our data to all processes that need it for( int i = 0; i < commPatternStarts.getRows(); i++ ) { if( i == rank ) continue; if( commPatternStarts( i, rank ) < commPatternEnds( i, rank ) ) - commRequests.push_back( CommunicatorType::ISend( + commRequests.push_back( MPI::Isend( inVector.getConstLocalView().getData() + commPatternStarts( i, rank ) - localRowRange.getBegin(), commPatternEnds( i, rank ) - commPatternStarts( i, rank ), i, 0, group ) ); @@ -208,7 +206,7 @@ public: if( j == rank ) continue; if( commPatternStarts( rank, j ) < commPatternEnds( rank, j ) ) - commRequests.push_back( CommunicatorType::IRecv( + commRequests.push_back( MPI::Irecv( globalBuffer.getPointer( commPatternStarts( rank, j ) ), commPatternEnds( rank, j ) - commPatternStarts( rank, j ), j, 0, group ) ); @@ -217,7 
+215,7 @@ public: // general variant if( localOnlySpan.first >= localOnlySpan.second ) { // wait for all communications to finish - CommunicatorType::WaitAll( commRequests.data(), commRequests.size() ); + MPI::Waitall( commRequests.data(), commRequests.size() ); // perform matrix-vector multiplication auto outVectorView = outVector.getLocalView(); @@ -231,7 +229,7 @@ public: localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second ); // wait for all communications to finish - CommunicatorType::WaitAll( commRequests.data(), commRequests.size() ); + MPI::Waitall( commRequests.data(), commRequests.size() ); // finish the multiplication by adding the non-local entries localMatrix.vectorProduct( globalBuffer, outVectorView, 1.0, 0.0, 0, localOnlySpan.first ); diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h index f88e315cc..7c03dd7ce 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h @@ -42,12 +42,12 @@ protected: VectorType diagonal; }; -template< typename Matrix, typename Communicator > -class Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > > -: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +class Diagonal< Matrices::DistributedMatrix< Matrix > > +: public Preconditioner< Matrices::DistributedMatrix< Matrix > > { public: - using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >; + using MatrixType = Matrices::DistributedMatrix< Matrix >; using RealType = typename MatrixType::RealType; using DeviceType = typename MatrixType::DeviceType; using IndexType = typename MatrixType::IndexType; diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h index d2227e57b..17746373a 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h 
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h @@ -53,9 +53,9 @@ solve( ConstVectorViewType b, VectorViewType x ) const } -template< typename Matrix, typename Communicator > +template< typename Matrix > void -Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >:: +Diagonal< Matrices::DistributedMatrix< Matrix > >:: update( const MatrixPointer& matrixPointer ) { TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" ); @@ -87,9 +87,9 @@ update( const MatrixPointer& matrixPointer ) } } -template< typename Matrix, typename Communicator > +template< typename Matrix > void -Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >:: +Diagonal< Matrices::DistributedMatrix< Matrix > >:: solve( ConstVectorViewType b, VectorViewType x ) const { ConstLocalViewType diag_view( diagonal ); diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index 857d8a063..a4eb9e8aa 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -194,11 +194,11 @@ protected: #endif }; -template< typename Matrix, typename Communicator > -class ILU0_impl< Matrices::DistributedMatrix< Matrix, Communicator >, double, Devices::Cuda, int > -: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +class ILU0_impl< Matrices::DistributedMatrix< Matrix >, double, Devices::Cuda, int > +: public Preconditioner< Matrices::DistributedMatrix< Matrix > > { - using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >; + using MatrixType = Matrices::DistributedMatrix< Matrix >; public: using RealType = double; using DeviceType = Devices::Cuda; diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h index 7a1879923..d98b78294 100644 --- a/src/TNL/Solvers/Linear/Traits.h +++ b/src/TNL/Solvers/Linear/Traits.h @@ -54,24 +54,21 @@ struct Traits static void waitForSynchronization( 
VectorViewType v ) {} }; -template< typename Matrix, typename Communicator > -struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +struct Traits< Matrices::DistributedMatrix< Matrix > > { using VectorType = Containers::DistributedVector < typename Matrix::RealType, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using VectorViewType = Containers::DistributedVectorView < typename Matrix::RealType, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using ConstVectorViewType = Containers::DistributedVectorView < std::add_const_t< typename Matrix::RealType >, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using LocalVectorType = Containers::Vector < typename Matrix::RealType, @@ -87,12 +84,11 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > typename Matrix::IndexType >; // compatibility wrappers for some DistributedMatrix methods - static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) - { return m.getLocalMatrix(); } + static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getLocalMatrix(); } static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); } static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); } - static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } + static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getCommunicationGroup(); } static void startSynchronization( VectorViewType v ) { v.startSynchronization(); } static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); } }; diff --git 
a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h index 2afda7aad..63b8fc273 100644 --- a/src/TNL/TypeTraits.h +++ b/src/TNL/TypeTraits.h @@ -253,4 +253,21 @@ public: static constexpr bool value = type::value; }; +/** + * \brief Type trait for checking if T has getCommunicationGroup method. + */ +template< typename T > +class HasGetCommunicationGroupMethod +{ +private: + typedef char YesType[1]; + typedef char NoType[2]; + + template< typename C > static YesType& test( decltype(std::declval< C >().getCommunicationGroup()) ); + template< typename C > static NoType& test(...); + +public: + static constexpr bool value = ( sizeof( test< std::decay_t<T> >(0) ) == sizeof( YesType ) ); +}; + } //namespace TNL diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index f594a081b..e25739afe 100644 --- a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -9,7 +9,6 @@ #ifdef HAVE_GTEST #include -#include #include #include @@ -17,6 +16,7 @@ using namespace TNL; using namespace TNL::Containers; +using namespace TNL::MPI; /* * Light check of DistributedArray. 
@@ -32,7 +32,6 @@ class DistributedArrayTest protected: using ValueType = typename DistributedArray::ValueType; using DeviceType = typename DistributedArray::DeviceType; - using CommunicatorType = typename DistributedArray::CommunicatorType; using IndexType = typename DistributedArray::IndexType; using DistributedArrayType = DistributedArray; using ArrayViewType = typename DistributedArrayType::LocalViewType; @@ -40,12 +39,12 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); DistributedArrayType distributedArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // some arbitrary even value (but must be 0 if not distributed) const int ghosts = (nproc > 1) ? 4 : 0; @@ -53,10 +52,10 @@ protected: DistributedArrayTest() { using LocalRangeType = typename DistributedArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedArray.setDistribution( localRange, ghosts, globalSize, group ); - using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >; + using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >; distributedArray.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); EXPECT_EQ( distributedArray.getLocalRange(), localRange ); @@ -67,10 +66,10 @@ protected: // types for which DistributedArrayTest is instantiated using DistributedArrayTypes = ::testing::Types< - DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator > + DistributedArray< double, Devices::Host, 
int > #ifdef HAVE_CUDA , - DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator > + DistributedArray< double, Devices::Cuda, int > #endif >; @@ -86,11 +85,9 @@ TYPED_TEST( DistributedArrayTest, checkLocalSizes ) TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const int localSize = this->distributedArray.getLocalView().getSize(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedArray.getSize(), this->globalSize ); } diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index 5a201980c..a90f09506 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -11,7 +11,6 @@ #include -#include #include #include #include @@ -21,6 +20,7 @@ using namespace TNL; using namespace TNL::Containers; +using namespace TNL::MPI; /* * Light check of DistributedVector. 
@@ -36,21 +36,20 @@ class DistributedVectorTest protected: using RealType = typename DistributedVector::RealType; using DeviceType = typename DistributedVector::DeviceType; - using CommunicatorType = typename DistributedVector::CommunicatorType; using IndexType = typename DistributedVector::IndexType; using DistributedVectorType = DistributedVector; using VectorViewType = typename DistributedVectorType::LocalViewType; - using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; + using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >; using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >; - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); DistributedVectorType v; DistributedVectorView v_view; HostDistributedVectorType v_host; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // should be small enough to have fast tests, but large enough to test // scan with multiple CUDA grids @@ -62,11 +61,11 @@ protected: DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); v.setDistribution( localRange, ghosts, globalSize, group ); - using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >; - using HostSynchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< Devices::Sequential >; + using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >; + 
using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >; v.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); v_view.setSynchronizer( v.getSynchronizer() ); v_host.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); @@ -78,10 +77,10 @@ protected: // types for which DistributedVectorTest is instantiated using DistributedVectorTypes = ::testing::Types< - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator > + DistributedVector< double, Devices::Host, int > #ifdef HAVE_CUDA , - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator > + DistributedVector< double, Devices::Cuda, int > #endif >; diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index b659beaea..b79b675cf 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -13,10 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -61,16 +61,13 @@ protected: using RightReal = std::remove_const_t< typename Right::RealType >; #ifndef STATIC_VECTOR #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename Left::CommunicatorType; - static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value, - "CommunicatorType must be the same for both Left and Right vectors." 
); - using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >; - using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >; + using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType >; + using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType >; - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // some arbitrary value (but must be 0 if not distributed) const int ghosts = (nproc > 1) ? 4 : 0; @@ -98,8 +95,8 @@ protected: #else #ifdef DISTRIBUTED_VECTOR using LocalRangeType = typename LeftVector::LocalRangeType; - using Synchronizer = typename Partitioner< typename Left::IndexType, CommunicatorType >::template ArraySynchronizer< typename Left::DeviceType >; - const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group ); + using Synchronizer = typename Partitioner< typename Left::IndexType >::template ArraySynchronizer< typename Left::DeviceType >; + const LocalRangeType localRange = Partitioner< typename Left::IndexType >::splitRange( size, group ); _L1.setDistribution( localRange, ghosts, size, group ); _L2.setDistribution( localRange, ghosts, size, group ); @@ -160,23 +157,23 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorPairs = ::testing::Types< #ifndef HAVE_CUDA - Pair< DistributedVector< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Host, int, Communicators::MpiCommunicator >, - 
DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > > + Pair< DistributedVector< int, Devices::Host, int >, + DistributedVector< short, Devices::Host, int > >, + Pair< DistributedVector< int, Devices::Host, int >, + DistributedVectorView< short, Devices::Host, int > >, + Pair< DistributedVectorView< int, Devices::Host, int >, + DistributedVector< short, Devices::Host, int > >, + Pair< DistributedVectorView< int, Devices::Host, int >, + DistributedVectorView< short, Devices::Host, int > > #else - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > > + Pair< DistributedVector< int, Devices::Cuda, int >, + DistributedVector< short, Devices::Cuda, int > >, + Pair< DistributedVector< int, Devices::Cuda, int >, + DistributedVectorView< short, Devices::Cuda, int > >, + Pair< DistributedVectorView< int, Devices::Cuda, int >, + DistributedVector< short, Devices::Cuda, int > >, + Pair< DistributedVectorView< int, Devices::Cuda, int >, + DistributedVectorView< short, 
Devices::Cuda, int > > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h index b7e8a1b95..32f2d52ba 100644 --- a/src/UnitTests/Containers/VectorHelperFunctions.h +++ b/src/UnitTests/Containers/VectorHelperFunctions.h @@ -2,6 +2,7 @@ #include #include +#include template< typename Vector > void setLinearSequence( Vector& deviceVector ) diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index 27422513b..485265e4e 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -13,10 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -51,15 +51,14 @@ protected: #else using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename VectorOrView::CommunicatorType; - using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > - using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // some arbitrary even value (but must be 0 if not 
distributed) const int ghosts = (nproc > 1) ? 4 : 0; @@ -75,13 +74,13 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorTypes = ::testing::Types< #ifndef HAVE_CUDA - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator > + DistributedVector< double, Devices::Host, int >, + DistributedVectorView< double, Devices::Host, int >, + DistributedVectorView< const double, Devices::Host, int > #else - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator > + DistributedVector< double, Devices::Cuda, int >, + DistributedVectorView< double, Devices::Cuda, int >, + DistributedVectorView< const double, Devices::Cuda, int > #endif >; #elif defined(STATIC_VECTOR) @@ -174,10 +173,9 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); #define SETUP_UNARY_VECTOR_TEST( size ) \ using VectorType = typename TestFixture::VectorType; \ using VectorOrView = typename TestFixture::VectorOrView; \ - using CommunicatorType = typename VectorOrView::CommunicatorType; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \ - using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename 
VectorOrView::DeviceType >; \ \ VectorType _V1, _V2; \ _V1.setDistribution( localRange, this->ghosts, size, this->group ); \ @@ -199,10 +197,9 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); EXPECTED_VECTOR( TestFixture, function ); \ using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \ using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \ - using CommunicatorType = typename VectorOrView::CommunicatorType; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \ - using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ \ HostVector _V1h; \ HostExpectedVector expected_h; \ @@ -282,11 +279,8 @@ void expect_vectors_near( const Left& _v1, const Right& _v2 ) using LeftNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Left::RealType > >; using RightNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Right::RealType > >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename Left::CommunicatorType; - static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value, - "CommunicatorType must be the same for both Left and Right vectors." 
); - using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >; - using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >; + using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >; + using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >; #else using LeftVector = Vector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >; using RightVector = Vector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >; diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 4ad0c8303..f73b502cc 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -13,10 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -52,15 +52,14 @@ protected: #else using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename VectorOrView::CommunicatorType; - using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > - using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; - const typename CommunicatorType::CommunicationGroup group = 
CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // some arbitrary value (but must be 0 if not distributed) const int ghosts = (nproc > 1) ? 4 : 0; @@ -84,8 +83,8 @@ protected: #else #ifdef DISTRIBUTED_VECTOR using LocalRangeType = typename VectorOrView::LocalRangeType; - using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, group ); _V1.setDistribution( localRange, ghosts, size, group ); _V1.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); #else @@ -111,13 +110,13 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorTypes = ::testing::Types< #ifndef HAVE_CUDA - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator > + DistributedVector< double, Devices::Host, int >, + DistributedVectorView< double, Devices::Host, int >, + DistributedVectorView< const double, Devices::Host, int > #else - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator > + DistributedVector< double, 
Devices::Cuda, int >, + DistributedVectorView< double, Devices::Cuda, int >, + DistributedVectorView< const double, Devices::Cuda, int > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h index 4cc584672..5e893e111 100644 --- a/src/UnitTests/Matrices/DistributedMatrixTest.h +++ b/src/UnitTests/Matrices/DistributedMatrixTest.h @@ -9,12 +9,12 @@ #ifdef HAVE_GTEST #include -#include #include #include #include using namespace TNL; +using namespace TNL::MPI; template< typename Vector > void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = 0 ) @@ -32,7 +32,7 @@ void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = template< typename Matrix, typename RowCapacities > void setMatrix( Matrix& matrix, const RowCapacities& rowCapacities ) { - using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential >, typename Matrix::CommunicatorType >; + using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential > >; using HostRowCapacities = typename RowCapacities::template Self< typename RowCapacities::RealType, TNL::Devices::Sequential >; HostMatrix hostMatrix; @@ -65,20 +65,19 @@ class DistributedMatrixTest protected: using RealType = typename DistributedMatrix::RealType; using DeviceType = typename DistributedMatrix::DeviceType; - using CommunicatorType = typename DistributedMatrix::CommunicatorType; using IndexType = typename DistributedMatrix::IndexType; using DistributedMatrixType = DistributedMatrix; using RowCapacitiesVector = typename DistributedMatrixType::CompressedRowLengthsVector; using GlobalVector = Containers::Vector< RealType, DeviceType, IndexType >; - using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, 
CommunicatorType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); DistributedMatrixType matrix; @@ -87,7 +86,7 @@ protected: DistributedMatrixTest() { using LocalRangeType = typename DistributedMatrix::LocalRangeType; - const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( globalSize, group ); matrix.setDistribution( localRange, globalSize, globalSize, group ); rowCapacities.setDistribution( localRange, 0, globalSize, group ); @@ -100,10 +99,10 @@ protected: // types for which DistributedMatrixTest is instantiated using DistributedMatrixTypes = ::testing::Types< - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int > > #ifdef HAVE_CUDA , - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int > > #endif >; @@ -111,11 +110,9 @@ TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes ); TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const int localSize = this->matrix.getLocalMatrix().getRows(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + Allreduce( 
&localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->matrix.getRows(), this->globalSize ); } -- GitLab From 6f74d8fa517830bda204e209fe2946bd16920ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Jan 2021 10:48:06 +0100 Subject: [PATCH 44/50] MPI refactoring: removed MpiCommunicator from DistributedMesh Also from DistributedMeshSynchronizer, PVTUReader and PVTUWriter --- src/TNL/Containers/ByteArraySynchronizer.h | 6 +- .../DistributedMeshes/DistributedMesh.h | 20 +++--- .../DistributedMeshSynchronizer.h | 52 +++++++-------- .../DistributedMeshes/distributeSubentities.h | 65 +++++++++---------- src/TNL/Meshes/Readers/PVTUReader.h | 13 ++-- src/TNL/Meshes/Writers/PVTUWriter.h | 3 +- src/TNL/Meshes/Writers/PVTUWriter.hpp | 7 +- src/Tools/tnl-game-of-life.cpp | 20 +++--- src/Tools/tnl-test-distributed-mesh.h | 22 +++---- .../DistributedMeshes/DistributedMeshTest.h | 56 ++++++++-------- 10 files changed, 124 insertions(+), 140 deletions(-) diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h index e25260909..0bfed4d92 100644 --- a/src/TNL/Containers/ByteArraySynchronizer.h +++ b/src/TNL/Containers/ByteArraySynchronizer.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include namespace TNL { @@ -42,7 +42,7 @@ private: public: using ByteArrayView = ArrayView< std::uint8_t, Device, Index >; - using RequestsVector = std::vector< typename Communicators::MpiCommunicator::Request >; + using RequestsVector = std::vector< MPI_Request >; enum class AsyncPolicy { synchronous, @@ -105,7 +105,7 @@ public: // immediate start, deferred synchronization (but still in the same thread) auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); auto worker = [requests] () mutable { - Communicators::MpiCommunicator::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); }; 
this->async_op = std::async( std::launch::deferred, worker ); } diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h index 9a79f823d..21116d357 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h @@ -13,7 +13,7 @@ #pragma once #include -#include +#include #include #include @@ -34,8 +34,6 @@ public: using PointType = typename Mesh::PointType; using RealType = typename PointType::RealType; using GlobalIndexArray = typename Mesh::GlobalIndexArray; - using CommunicatorType = Communicators::MpiCommunicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using VTKTypesArrayType = Containers::Array< std::uint8_t, Devices::Sequential, GlobalIndexType >; DistributedMesh() = default; @@ -101,12 +99,12 @@ public: /** * Methods specific to the distributed mesh */ - void setCommunicationGroup( CommunicationGroup group ) + void setCommunicationGroup( MPI_Comm group ) { this->group = group; } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -190,10 +188,10 @@ public: const GlobalIndexType verticesCount = localMesh.template getEntitiesCount< 0 >(); const GlobalIndexType cellsCount = localMesh.template getEntitiesCount< Mesh::getMeshDimension() >(); - CommunicatorType::Barrier(); - for( int i = 0; i < CommunicatorType::GetSize(); i++ ) { - if( i == CommunicatorType::GetRank() ) { - str << "MPI rank:\t" << CommunicatorType::GetRank() << "\n" + MPI::Barrier(); + for( int i = 0; i < MPI::GetSize(); i++ ) { + if( i == MPI::GetRank() ) { + str << "MPI rank:\t" << MPI::GetRank() << "\n" << "\tMesh dimension:\t" << getMeshDimension() << "\n" << "\tCell topology:\t" << getType( typename Cell::EntityTopology{} ) << "\n" << "\tCells count:\t" << cellsCount << "\n" @@ -230,13 +228,13 @@ public: } str.flush(); } - CommunicatorType::Barrier(); + MPI::Barrier(); } } 
protected: MeshType localMesh; - CommunicationGroup group = CommunicatorType::NullGroup; + MPI_Comm group = MPI::NullGroup(); int ghostLevels = 0; // vtkGhostType arrays for points and cells (cached for output into VTK formats) diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h index 382de6905..36f28ba45 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace TNL { namespace Meshes { @@ -40,7 +41,6 @@ class DistributedMeshSynchronizer public: using DeviceType = typename DistributedMesh::DeviceType; using GlobalIndexType = typename DistributedMesh::GlobalIndexType; - using CommunicatorType = typename DistributedMesh::CommunicatorType; using ByteArrayView = typename Base::ByteArrayView; using RequestsVector = typename Base::RequestsVector; @@ -61,8 +61,8 @@ public: "Global indices are not allocated properly." 
); group = mesh.getCommunicationGroup(); - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // exchange the global index offsets so that each rank can determine the // owner of every entity by its global index @@ -71,9 +71,9 @@ public: { Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( ownStart ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + group ); } // count local ghost entities for each rank @@ -110,9 +110,9 @@ public: for( int j = 0; j < nproc; j++ ) for( int i = 0; i < nproc; i++ ) sendbuf.setElement( j, i, localGhostCounts[ i ] ); - CommunicatorType::Alltoall( &sendbuf(0, 0), nproc, - &ghostEntitiesCounts(0, 0), nproc, - group ); + MPI::Alltoall( &sendbuf(0, 0), nproc, + &ghostEntitiesCounts(0, 0), nproc, + group ); } // allocate ghost offsets @@ -136,7 +136,7 @@ public: ghostOffsets[ 0 ] = ghostOffset; for( int i = 0; i < nproc; i++ ) { if( ghostEntitiesCounts( rank, i ) > 0 ) { - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( mesh.template getGlobalIndices< EntityDimension >().getData() + ghostOffset, ghostEntitiesCounts( rank, i ), i, 0, group ) ); @@ -151,7 +151,7 @@ public: // receive ghost indices from the neighboring ranks for( int j = 0; j < nproc; j++ ) { if( ghostEntitiesCounts( j, rank ) > 0 ) { - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( ghostNeighbors.getData() + ghostNeighborOffsets[ j ], ghostEntitiesCounts( j, rank ), j, 0, group ) ); @@ -159,7 +159,7 @@ public: } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); // convert received ghost indices from global to local 
ghostNeighbors -= ownStart; @@ -201,7 +201,7 @@ public: virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override { auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); } virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override @@ -209,8 +209,8 @@ public: TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ], "The array does not have the expected size." ); - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // allocate send buffers (setSize does nothing if the array size is already correct) sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] ); @@ -221,7 +221,7 @@ public: // issue all receive async operations for( int j = 0; j < nproc; j++ ) { if( ghostEntitiesCounts( rank, j ) > 0 ) { - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( array.getData() + bytesPerValue * ghostOffsets[ j ], bytesPerValue * ghostEntitiesCounts( rank, j ), j, 0, group ) ); @@ -245,7 +245,7 @@ public: Algorithms::ParallelFor< DeviceType >::exec( (GlobalIndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset ); // issue async send operation - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ], bytesPerValue * ghostEntitiesCounts( i, rank ), i, 0, group ) ); @@ -268,8 +268,8 @@ public: { TNL_ASSERT_EQ( pattern.getRows(), ghostOffsets[ ghostOffsets.getSize() - 1 ], "invalid sparse pattern matrix" ); - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + 
const int nproc = MPI::GetSize( group ); // buffer for asynchronous communication requests RequestsVector requests; @@ -306,7 +306,7 @@ public: // send our row sizes to the target rank if( ! assumeConsistentRowCapacities ) // issue async send operation - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( send_rowCapacities.getData() + send_rankOffsets[ i ], ghostNeighborOffsets[ i + 1 ] - ghostNeighborOffsets[ i ], i, 1, group ) ); @@ -334,7 +334,7 @@ public: if( send_rankOffsets[ i + 1 ] == send_rankOffsets[ i ] ) continue; // issue async send operation - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( send_columnIndices.getData() + send_rowPointers[ send_rankOffsets[ i ] ], send_rowPointers[ send_rankOffsets[ i + 1 ] ] - send_rowPointers[ send_rankOffsets[ i ] ], i, 0, group ) ); @@ -369,7 +369,7 @@ public: else { // receive row sizes from the sender // issue async recv operation - row_lengths_requests.push_back( CommunicatorType::IRecv( + row_lengths_requests.push_back( MPI::Irecv( recv_rowPointers.getData() + recv_rankOffsets[ i ], ghostOffsets[ i + 1 ] - ghostOffsets[ i ], i, 1, group ) ); @@ -378,7 +378,7 @@ public: if( ! 
assumeConsistentRowCapacities ) { // wait for all row lengths - CommunicatorType::WaitAll( row_lengths_requests.data(), row_lengths_requests.size() ); + MPI::Waitall( row_lengths_requests.data(), row_lengths_requests.size() ); // scan the rowPointers array to convert Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView; @@ -393,7 +393,7 @@ public: if( recv_rankOffsets[ i + 1 ] == recv_rankOffsets[ i ] ) continue; // issue async recv operation - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( recv_columnIndices.getData() + recv_rowPointers[ recv_rankOffsets[ i ] ], recv_rowPointers[ recv_rankOffsets[ i + 1 ] ] - recv_rowPointers[ recv_rankOffsets[ i ] ], i, 0, group ) ); @@ -401,7 +401,7 @@ public: } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return std::make_tuple( recv_rankOffsets, recv_rowPointers, recv_columnIndices ); } @@ -445,7 +445,7 @@ public: protected: // communication group taken from the distributed mesh - typename CommunicatorType::CommunicationGroup group; + MPI_Comm group; /** * Global offsets: array of size nproc where the i-th value is the lowest diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h index 63a10b1cf..120cadf80 100644 --- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h +++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h @@ -19,14 +19,14 @@ namespace TNL { namespace Meshes { namespace DistributedMeshes { -template< typename CommunicatorType, typename GlobalIndexType > +template< typename GlobalIndexType > auto -exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, +exchangeGhostEntitySeeds( MPI_Comm group, const std::vector< std::vector< GlobalIndexType > >& seeds_vertex_indices, const std::vector< std::vector< GlobalIndexType > >& 
seeds_entity_offsets ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // exchange sizes of the arrays Containers::Array< GlobalIndexType, Devices::Host, int > sizes_vertex_indices( nproc ), sizes_entity_offsets( nproc ); @@ -36,12 +36,12 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, sendbuf_indices[ i ] = seeds_vertex_indices[ i ].size(); sendbuf_offsets[ i ] = seeds_entity_offsets[ i ].size(); } - CommunicatorType::Alltoall( sendbuf_indices.getData(), 1, - sizes_vertex_indices.getData(), 1, - group ); - CommunicatorType::Alltoall( sendbuf_offsets.getData(), 1, - sizes_entity_offsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf_indices.getData(), 1, + sizes_vertex_indices.getData(), 1, + group ); + MPI::Alltoall( sendbuf_offsets.getData(), 1, + sizes_entity_offsets.getData(), 1, + group ); } // allocate arrays for the results @@ -54,17 +54,17 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, } // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + std::vector< MPI_Request > requests; // issue all async receive operations for( int j = 0; j < nproc; j++ ) { if( j == rank ) continue; - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( foreign_seeds_vertex_indices[ j ].data(), foreign_seeds_vertex_indices[ j ].size(), j, 0, group ) ); - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( foreign_seeds_entity_offsets[ j ].data(), foreign_seeds_entity_offsets[ j ].size(), j, 1, group ) ); @@ -74,30 +74,30 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, for( int i = 0; i < nproc; i++ ) { if( i == rank ) continue; - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( 
seeds_vertex_indices[ i ].data(), seeds_vertex_indices[ i ].size(), i, 0, group ) ); - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( seeds_entity_offsets[ i ].data(), seeds_entity_offsets[ i ].size(), i, 1, group ) ); } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return std::make_tuple( foreign_seeds_vertex_indices, foreign_seeds_entity_offsets ); } -template< typename CommunicatorType, typename GlobalIndexType > +template< typename GlobalIndexType > auto -exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, +exchangeGhostIndices( MPI_Comm group, const std::vector< std::vector< GlobalIndexType > >& foreign_ghost_indices, const std::vector< std::vector< GlobalIndexType > >& seeds_local_indices ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // allocate arrays for the results std::vector< std::vector< GlobalIndexType > > ghost_indices; @@ -106,13 +106,13 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, ghost_indices[ i ].resize( seeds_local_indices[ i ].size() ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + std::vector< MPI_Request > requests; // issue all async receive operations for( int j = 0; j < nproc; j++ ) { if( j == rank ) continue; - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( ghost_indices[ j ].data(), ghost_indices[ j ].size(), j, 0, group ) ); @@ -122,14 +122,14 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, for( int i = 0; i < nproc; i++ ) { if( i == rank ) continue; - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( foreign_ghost_indices[ i 
].data(), foreign_ghost_indices[ i ].size(), i, 0, group ) ); } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return ghost_indices; } @@ -145,7 +145,6 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) using GlobalIndexType = typename DistributedMesh::GlobalIndexType; using LocalIndexType = typename DistributedMesh::LocalIndexType; using LocalMesh = typename DistributedMesh::MeshType; - using CommunicatorType = typename DistributedMesh::CommunicatorType; static_assert( ! std::is_same< DeviceType, Devices::Cuda >::value, "this method can be called only for host meshes" ); @@ -154,8 +153,8 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) if( mesh.getGhostLevels() <= 0 ) throw std::logic_error( "There are no ghost levels on the distributed mesh." ); - const int rank = CommunicatorType::GetRank( mesh.getCommunicationGroup() ); - const int nproc = CommunicatorType::GetSize( mesh.getCommunicationGroup() ); + const int rank = MPI::GetRank( mesh.getCommunicationGroup() ); + const int nproc = MPI::GetSize( mesh.getCommunicationGroup() ); // 0. exchange cell data to prepare getCellOwner for use in getEntityOwner DistributedMeshSynchronizer< DistributedMesh, DistributedMesh::getMeshDimension() > cell_synchronizer; @@ -235,9 +234,9 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( localEntitiesCount ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - mesh.getCommunicationGroup() ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + mesh.getCommunicationGroup() ); } globalOffsets.template scan< Algorithms::ScanType::Exclusive >(); @@ -288,7 +287,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) } // 5. 
exchange seeds for ghost entities - const auto foreign_seeds = exchangeGhostEntitySeeds< CommunicatorType >( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets ); + const auto foreign_seeds = exchangeGhostEntitySeeds( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets ); const auto& foreign_seeds_vertex_indices = std::get< 0 >( foreign_seeds ); const auto& foreign_seeds_entity_offsets = std::get< 1 >( foreign_seeds ); @@ -373,7 +372,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) }); // 6b. exchange global ghost indices - const auto ghost_indices = exchangeGhostIndices< CommunicatorType >( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices ); + const auto ghost_indices = exchangeGhostIndices( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices ); // 6c. set the global indices of our ghost entities bool done = true; @@ -387,7 +386,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) // 6d. check if finished bool all_done = false; - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); if( all_done ) break; } diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h index 393ee1551..725aa7fec 100644 --- a/src/TNL/Meshes/Readers/PVTUReader.h +++ b/src/TNL/Meshes/Readers/PVTUReader.h @@ -14,7 +14,7 @@ #include -#include +#include #include #include @@ -67,13 +67,13 @@ class PVTUReader throw MeshReaderError( "PVTUReader", "the file does not contain any element." 
); // check that the number of pieces matches the number of MPI ranks - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); if( (int) pieceSources.size() != nproc ) throw MeshReaderError( "PVTUReader", "the number of subdomains does not match the number of MPI ranks (" + std::to_string(pieceSources.size()) + " vs " + std::to_string(nproc) + ")." ); // read the local piece source - const int rank = CommunicatorType::GetRank( group ); + const int rank = MPI::GetRank( group ); localReader.setFileName( pieceSources[ rank ] ); localReader.detectMesh(); @@ -100,12 +100,9 @@ class PVTUReader #endif public: - using CommunicatorType = Communicators::MpiCommunicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; - PVTUReader() = default; - PVTUReader( const std::string& fileName, CommunicationGroup group = CommunicatorType::AllGroup ) + PVTUReader( const std::string& fileName, MPI_Comm group = MPI::AllGroup() ) : XMLVTK( fileName ), group( group ) {} @@ -233,7 +230,7 @@ public: } protected: - CommunicationGroup group; + MPI_Comm group; int ghostLevels = 0; int minCommonVertices = 0; diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h index 5aa9cd2b0..2f332d20e 100644 --- a/src/TNL/Meshes/Writers/PVTUWriter.h +++ b/src/TNL/Meshes/Writers/PVTUWriter.h @@ -65,9 +65,8 @@ public: // add all pieces and return the source path for the current rank // (useful for parallel writing) - template< typename Communicator > std::string addPiece( const String& mainFileName, - const typename Communicator::CommunicationGroup group ); + const MPI_Comm group ); ~PVTUWriter(); diff --git a/src/TNL/Meshes/Writers/PVTUWriter.hpp b/src/TNL/Meshes/Writers/PVTUWriter.hpp index 71e19da1d..affee65a2 100644 --- a/src/TNL/Meshes/Writers/PVTUWriter.hpp +++ b/src/TNL/Meshes/Writers/PVTUWriter.hpp @@ -137,15 +137,14 @@ PVTUWriter< Mesh >::addPiece( const String& mainFileName, } template< typename Mesh 
> - template< typename Communicator > std::string PVTUWriter< Mesh >::addPiece( const String& mainFileName, - const typename Communicator::CommunicationGroup group ) + const MPI_Comm group ) { std::string source; - for( int i = 0; i < Communicator::GetSize( group ); i++ ) { + for( int i = 0; i < MPI::GetSize( group ); i++ ) { const std::string s = addPiece( mainFileName, i ); - if( i == Communicator::GetRank( group ) ) + if( i == MPI::GetRank( group ) ) source = s; } return source; diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp index a2d4f48e9..7003489ab 100644 --- a/src/Tools/tnl-game-of-life.cpp +++ b/src/Tools/tnl-game-of-life.cpp @@ -17,13 +17,11 @@ #include #include #include -#include #include +#include using namespace TNL; -using CommunicatorType = Communicators::MpiCommunicator; - struct MyConfigTag {}; namespace TNL { @@ -198,8 +196,8 @@ bool runGameOfLife( const Mesh& mesh ) } } Index max_count; - CommunicatorType::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() ); - std::cout << "Rank " << CommunicatorType::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl; + TNL::MPI::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() ); + std::cout << "Rank " << TNL::MPI::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl; Index reference_cell = 0; if( count == max_count ) { // find cell which has all points in the central box @@ -256,7 +254,7 @@ bool runGameOfLife( const Mesh& mesh ) // create a .pvtu file (only rank 0 actually writes to the file) const std::string mainFilePath = "GoL." 
+ std::to_string(iteration) + ".pvtu"; std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -266,7 +264,7 @@ bool runGameOfLife( const Mesh& mesh ) if( mesh.getGhostLevels() > 0 ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< Real >( "function values" ); - const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -292,7 +290,7 @@ bool runGameOfLife( const Mesh& mesh ) Index iteration = 0; do { iteration++; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) std::cout << "Computing iteration " << iteration << "..." << std::endl; // iterate over all local entities @@ -338,7 +336,7 @@ bool runGameOfLife( const Mesh& mesh ) // check if finished const bool done = max( f_in.getData() ) == 0 || iteration > max_iter || f_in.getData() == f_out.getData(); - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); } while( all_done == false ); @@ -351,7 +349,7 @@ void configSetup( Config::ConfigDescription& config ) config.addRequiredEntry< String >( "input-file", "Input file with the mesh." ); config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" ); config.addDelimiter( "MPI settings:" ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int main( int argc, char* argv[] ) @@ -366,7 +364,7 @@ int main( int argc, char* argv[] ) if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; - if( ! 
CommunicatorType::setup( parameters ) ) + if( ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String inputFileName = parameters.getParameter< String >( "input-file" ); diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h index 1b8c59c75..6b748d993 100644 --- a/src/Tools/tnl-test-distributed-mesh.h +++ b/src/Tools/tnl-test-distributed-mesh.h @@ -18,13 +18,11 @@ #include #include #include -#include #include +#include using namespace TNL; -using CommunicatorType = Communicators::MpiCommunicator; - struct MyConfigTag {}; namespace TNL { @@ -214,7 +212,7 @@ void testSynchronizerOnDevice( const MeshType& mesh ) if( received != center ) { IndexType cellIndexes[ 2 ] = {0, 0}; const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes ); - std::cerr << "rank " << CommunicatorType::GetRank() + std::cerr << "rank " << TNL::MPI::GetRank() << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")" << " of dimension = " << EntityType::getEntityDimension() << ": received " << received << ", expected = " << center @@ -224,7 +222,7 @@ void testSynchronizerOnDevice( const MeshType& mesh ) } } if( errors > 0 ) { - std::cerr << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl; + std::cerr << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." 
<< std::endl; TNL_ASSERT_TRUE( false, "test failed" ); } } @@ -273,7 +271,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) // create a .pvtu file (only rank 0 actually writes to the file) const std::string mainFilePath = "data_" + std::to_string(iteration) + ".pvtu"; std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -284,7 +282,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< Real >( "function values" ); pvtu.template writePCellData< Real >( "test values" ); - const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -315,7 +313,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) int iteration = 0; do { iteration++; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) std::cout << "Computing iteration " << iteration << "..." << std::endl; const Index prev_sum = sum( f_K.getData() ); @@ -400,14 +398,14 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) std::cerr << "ERROR: propatation over faces differs from the propagation over neighbor cells. 
Differing values are:\n"; for( Index K = 0; K < f_K_view.getSize(); K++ ) if( f_K_view[ K ] != f_K_test_view[ K ] ) - std::cerr << " rank = " << CommunicatorType::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n"; + std::cerr << " rank = " << TNL::MPI::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n"; std::cerr.flush(); TNL_ASSERT_TRUE( false, "test failed" ); } // check if finished const bool done = sum( f_K.getData() ) == prev_sum || iteration > max_iterations; - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); } while( all_done == false ); @@ -421,7 +419,7 @@ void configSetup( Config::ConfigDescription& config ) config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" ); config.addEntry< int >( "max-iterations", "Maximum number of iterations to compute", 100 ); config.addDelimiter( "MPI settings:" ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int main( int argc, char* argv[] ) @@ -436,7 +434,7 @@ int main( int argc, char* argv[] ) if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; - if( ! CommunicatorType::setup( parameters ) ) + if( ! 
TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String inputFileName = parameters.getParameter< String >( "input-file" ); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h index b778937b6..a0eddd162 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -32,9 +31,6 @@ using namespace TNL::Meshes::DistributedMeshes; // cannot be deduced from the grid using LocalIndexType = short int; -// we test only with MPI -using CommunicatorType = Communicators::MpiCommunicator; -using CommunicationGroup = typename CommunicatorType::CommunicationGroup; template< typename Mesh > struct GridDistributor; @@ -54,9 +50,9 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > > GridDistributor() = delete; - GridDistributor( CoordinatesType rank_sizes, CommunicationGroup group ) - : rank(CommunicatorType::GetRank(group)), - nproc(CommunicatorType::GetSize(group)), + GridDistributor( CoordinatesType rank_sizes, MPI_Comm group ) + : rank(TNL::MPI::GetRank(group)), + nproc(TNL::MPI::GetSize(group)), rank_sizes(rank_sizes), group(group) {} @@ -328,7 +324,7 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > > // input parameters int rank, nproc; CoordinatesType rank_sizes; - CommunicationGroup group; + MPI_Comm group; // output attributes (byproduct of the decomposition, useful for testing) CoordinatesType rank_coordinates, local_size, vert_begin, vert_end, cell_begin, cell_end; Index verticesCount, cellsCount, localVerticesCount, localCellsCount; @@ -341,7 +337,7 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe using Device = typename Mesh::DeviceType; // check basic interface - EXPECT_EQ( mesh.getCommunicationGroup(), CommunicatorType::AllGroup ); + 
EXPECT_EQ( mesh.getCommunicationGroup(), TNL::MPI::AllGroup() ); EXPECT_EQ( mesh.getGhostLevels(), ghostLevels ); if( ghostLevels > 0 ) { EXPECT_EQ( mesh.template getGlobalIndices< 0 >().getSize(), mesh.getLocalMesh().template getEntitiesCount< 0 >() ); @@ -398,12 +394,12 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe Containers::Array< Index, Device > vert_sendbuf( distributor.nproc ), cell_sendbuf( distributor.nproc ); vert_sendbuf.setValue( distributor.localVerticesCount ); cell_sendbuf.setValue( distributor.localCellsCount ); - CommunicatorType::Alltoall( vert_sendbuf.getData(), 1, - vert_offsets.getData(), 1, - distributor.group ); - CommunicatorType::Alltoall( cell_sendbuf.getData(), 1, - cell_offsets.getData(), 1, - distributor.group ); + TNL::MPI::Alltoall( vert_sendbuf.getData(), 1, + vert_offsets.getData(), 1, + distributor.group ); + TNL::MPI::Alltoall( cell_sendbuf.getData(), 1, + cell_offsets.getData(), 1, + distributor.group ); } vert_offsets.setElement( distributor.nproc, 0 ); cell_offsets.setElement( distributor.nproc, 0 ); @@ -661,7 +657,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh ) if( received != center ) { IndexType cellIndexes[ 2 ] = {0, 0}; const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes ); - std::cerr << "rank " << CommunicatorType::GetRank() + std::cerr << "rank " << TNL::MPI::GetRank() << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")" << " of dimension = " << EntityType::getEntityDimension() << ": received " << received << ", expected = " << center @@ -671,7 +667,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh ) } } if( errors > 0 ) - FAIL() << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl; + FAIL() << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." 
<< std::endl; } template< typename Device, typename EntityType, typename MeshType > @@ -703,10 +699,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel0 ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 0; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -720,10 +716,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel1 ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 1; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -738,10 +734,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel2 ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 2; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -756,10 +752,10 @@ TEST( DistributedMeshTest, PVTUWriterReader ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; 
grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 2; distributor.decompose( grid, mesh, ghostLevels ); @@ -769,7 +765,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) std::string subfilePath; { std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -780,7 +776,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< typename Mesh::GlobalIndexType >( "GlobalIndex" ); } - subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -798,7 +794,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) } // load and test - CommunicatorType::Barrier(); + TNL::MPI::Barrier(); Readers::PVTUReader reader( mainFilePath ); reader.detectMesh(); EXPECT_EQ( reader.getMeshType(), "Meshes::DistributedMesh" ); @@ -812,8 +808,8 @@ TEST( DistributedMeshTest, PVTUWriterReader ) // cleanup EXPECT_EQ( fs::remove( subfilePath ), true ); - CommunicatorType::Barrier(); - if( CommunicatorType::GetRank() == 0 ) { + TNL::MPI::Barrier(); + if( TNL::MPI::GetRank() == 0 ) { EXPECT_EQ( fs::remove( mainFilePath ), true ); EXPECT_EQ( fs::remove( baseName ), true ); } -- GitLab From 037c825547d5f7efe933c5d5abf45cfc212de317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Jan 2021 14:50:20 +0100 Subject: 
[PATCH 45/50] MPI refactoring: removed MpiCommunicator from Python bindings --- src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp | 3 +-- src/Python/pytnl/tnl_mpi/tnl_mpi.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp index 089d59adf..17bf57c12 100644 --- a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp @@ -78,8 +78,7 @@ void export_DistributedMeshWriter( py::module & m, const char* name ) }, py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) // NOTE: only the overload intended for sequential writing is exported, because we don't - // have type casters for Communicators::MpiCommunicator::CommunicationGroup - // (ideally, the communication group would be compatible with the mpi4py objects) + // have type casters for MPI_Comm (ideally, it would be compatible with the mpi4py objects) .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ), py::arg("mainFileName"), py::arg("subdomainIndex")) ; diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp index a12060600..a422795b6 100644 --- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -3,6 +3,7 @@ // conversions have to be registered for each object file #include "../tnl_conversions.h" +#include "TNL/MPI/Wrappers.h" // external functions void export_DistributedMeshes( py::module & m ); @@ -18,15 +19,15 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m) // MPI initialization and finalization // https://stackoverflow.com/q/64647846 - if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) { + if( ! 
TNL::MPI::Initialized() ) { int argc = 0; char** argv = nullptr; - TNL::Communicators::MpiCommunicator::Init( argc, argv ); + TNL::MPI::Init( argc, argv ); } // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors auto cleanup_callback = []() { - if( TNL::Communicators::MpiCommunicator::IsInitialized() ) - TNL::Communicators::MpiCommunicator::Finalize(); + if( TNL::MPI::Initialized() && ! TNL::MPI::Finalized() ) + TNL::MPI::Finalize(); }; m.add_object("_cleanup", py::capsule(cleanup_callback)); -- GitLab From db5c4615096ecafd276f93093a7dd697e7715ce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Jan 2021 15:07:52 +0100 Subject: [PATCH 46/50] MPI refactoring: removed MpiCommunicator from the distributed grid and related classes --- .../tnlDirectEikonalProblem_impl.h | 12 +- .../tnlFastSweepingMethod2D_impl.h | 188 ++++++------- .../tnlFastSweepingMethod3D_impl.h | 230 +++++++-------- src/TNL/Functions/CutMeshFunction.h | 23 +- .../DistributedMeshes/DistributedGrid.h | 81 +++--- .../DistributedMeshes/DistributedGrid.hpp | 153 +++++----- .../DistributedMeshes/DistributedGridIO.h | 7 +- .../DistributedGridIO_MeshFunction.h | 28 +- .../DistributedGridIO_VectorField.h | 44 +-- .../DistributedGridSynchronizer.h | 18 +- .../SubdomainOverlapsGetter.h | 49 ++-- .../SubdomainOverlapsGetter.hpp | 57 ++-- .../DistributedMeshes/loadDistributedMesh.h | 9 +- src/TNL/Problems/HeatEquationProblem_impl.h | 12 +- src/TNL/Problems/PDEProblem_impl.h | 6 +- .../Solvers/PDE/TimeDependentPDESolver_impl.h | 2 +- .../PDE/TimeIndependentPDESolver_impl.h | 2 +- src/TNL/Solvers/SolverStarter_impl.h | 12 +- src/TNL/Solvers/Solver_impl.h | 4 +- src/Tools/tnl-init.cpp | 4 +- src/Tools/tnl-init.h | 17 +- .../CutDistributedGridTest.cpp | 116 ++++---- .../CutDistributedMeshFunctionTest.cpp | 65 ++--- .../DistributedMeshes/CutMeshFunctionTest.cpp | 58 ++-- .../DistributedMeshes/DistributedGridIOTest.h | 24 +- .../DistributedGridIO_MPIIOTest.h 
| 22 +- .../DistributedGridTest_1D.cpp | 89 +++--- .../DistributedGridTest_2D.cpp | 263 +++++++++--------- .../DistributedGridTest_3D.cpp | 73 +++-- .../DistributedVectorFieldIO_MPIIOTestBase.h | 64 ++--- 30 files changed, 828 insertions(+), 904 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 7bfeb4976..3e1ea757b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod_impl.h * Author: oberhuber * @@ -25,7 +25,7 @@ String tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: getType() { - return String( "DirectEikonalProblem< " + + return String( "DirectEikonalProblem< " + Mesh::getType() + ", " + Anisotropy::getType() + ", " + Real::getType() + ", " + @@ -54,7 +54,7 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: writeProlog( Logger& logger, const Config::ParameterContainer& parameters ) const { - + } template< typename Mesh, @@ -123,7 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( dofs ); String inputFile = parameters.getParameter< String >( "input-file" ); - this->initialData->setMesh( this->getMesh() ); + this->initialData->setMesh( this->getMesh() ); if( CommunicatorType::isDistributed() ) { std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; @@ -132,7 +132,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); synchronizer.setDistributedGrid( 
initialData->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *initialData ); + synchronizer.synchronize( *initialData ); } else { @@ -190,7 +190,7 @@ solve( DofVectorPointer& dofs ) { FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; fsm.solve( this->getMesh(), u, anisotropy, initialData ); - + makeSnapshot(); return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index a1ca740e4..14a52ec40 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod2D_impl.h * Author: oberhuber * @@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisot FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, @@ -36,7 +36,7 @@ const Index& FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, @@ -48,68 +48,68 @@ void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, typename Device, typename Index, typename Communicator, - typename Anisotropy > + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, const MeshFunctionPointer& u ) -{ +{ MeshFunctionPointer auxPtr; InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - + // Setting overlaps ( WITHOUT MPI SHOULD BE 0 ) StaticVector 
vecLowerOverlaps, vecUpperOverlaps; setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); - + std::cout << "Initiating the interface cells ..." << std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - + //auxPtr->save( "aux-ini.tnl" ); - + typename MeshType::Cell cell( *mesh ); - + IndexType iteration( 0 ); InterfaceMapType interfaceMap = *interfaceMapPtr; MeshFunctionType aux = *auxPtr; synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() ); - synchronizer.template synchronize< Communicator >( aux ); //synchronize initialized overlaps - - std::cout << "Calculating the values ..." << std::endl; + synchronizer.synchronize( aux ); //synchronize initialized overlaps + + std::cout << "Calculating the values ..." << std::endl; while( iteration < this->maxIterations ) { - // calculatedBefore indicates weather we calculated in the last passage of the while cycle - // calculatedBefore is same for all ranks + // calculatedBefore indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body int calculatedBefore = 1; - + // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body - int calculateMPIAgain = 1; - + int calculateMPIAgain = 1; + while( calculatedBefore ) { calculatedBefore = 0; - + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? { calculateMPIAgain = 0; - + /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ /* int numThreadsPerBlock = -1; - + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); if( numThreadsPerBlock <= 16 ) @@ -127,28 +127,28 @@ solve( const MeshPointer& mesh, else numThreadsPerBlock = 1024; //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - + if( numThreadsPerBlock == -1 ){ printf("Fail in setting numThreadsPerBlock.\n"); break; } - - - + + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); - + //std::cout << "numBlocksX = " << numBlocksX << std::endl; - + //Real **sArray = new Real*[numBlocksX*numBlocksY]; //for( int i = 0; i < numBlocksX * numBlocksY; i++ ) // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; - + ArrayContainer BlockIterHost; BlockIterHost.setSize( numBlocksX * numBlocksY ); BlockIterHost.setValue( 1 ); int IsCalculationDone = 1; - + MeshFunctionPointer helpFunc( mesh ); MeshFunctionPointer helpFunc1( mesh ); helpFunc1 = auxPtr; @@ -164,7 +164,7 @@ solve( const MeshPointer& mesh, // std::cout<template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); } - - - //Reduction + + + //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; @@ -196,16 +196,16 @@ solve( const MeshPointer& mesh, } numWhile++; //std::cout <<"numWhile = "<< numWhile <-1; j-- ){ // for( int i = 0; i < numBlocksX; i++ ) // std::cout << BlockIterHost[ j * numBlocksX + i ]; // std::cout << std::endl; // } // std::cout << std::endl; - + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); - + //std::cout<getDimensions().x() - vecUpperOverlaps[0]; calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save("aux-1.tnl"); - + // UP and LEFL boundsFrom[1] = 
vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-2.tnl" ); - + // DOWN and RIGHT boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-3.tnl" ); - + // DOWN and LEFT boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - + } if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA? { calculateMPIAgain = 0; - + #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 16 ); - + // Setting number of threads and blocks for kernel int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); - + // Need for calling functions from kernel BaseType ptr; - + // True if we should calculate again. int calculateCudaBlocksAgain = 1; - + // Array that identifies which blocks should be calculated. 
// All blocks should calculate in first passage ( setValue(1) ) TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY ); blockCalculationIndicator.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; - + // Array into which we identify the neighbours and then copy it into blockCalculationIndicator TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY ); blockCalculationIndicatorHelp.setValue( 0 ); - + // number of Blocks for kernel that calculates neighbours. int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - + // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() Containers::Vector< RealType, DeviceType, IndexType > helpVec; helpVec.setLike( auxPtr.template getData().getData() ); MeshFunctionPointer helpFunc; helpFunc->bind( mesh, helpVec ); - helpFunc.template modifyData() = auxPtr.template getData(); - + helpFunc.template modifyData() = auxPtr.template getData(); + // number of iterations of while calculateCudaBlocksAgain int numIter = 0; - + //int oddEvenBlock = 0; while( calculateCudaBlocksAgain ) { /** HERE IS CHESS METHOD (NO MPI) **/ - + /* CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -302,25 +302,25 @@ solve( const MeshPointer& mesh, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), helpFunc.template getData< Device>(), auxPtr.template modifyData< Device>(), - blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + oddEvenBlock= (oddEvenBlock == 0) ? 
1: 0; - + calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); */ /**------------------------------------------------------------------------------------------------*/ - - + + /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -328,10 +328,10 @@ solve( const MeshPointer& mesh, blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + // Switching helpFunc and auxPtr. auxPtr.swap( helpFunc ); - + // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY ); @@ -340,15 +340,15 @@ solve( const MeshPointer& mesh, blockCalculationIndicator = blockCalculationIndicatorHelp; cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); - + // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) if( calculateCudaBlocksAgain ){ calculatedBefore = 1; } - + /**-----------------------------------------------------------------------------------------------------------*/ numIter ++; } @@ -364,13 +364,13 @@ solve( const MeshPointer& mesh, #endif } - -/**----------------------MPI-TO-DO---------------------------------------------**/ + +/**----------------------MPI-TO-DO---------------------------------------------**/ #ifdef HAVE_MPI if( CommunicatorType::isDistributed() ){ getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); - - synchronizer.template synchronize< 
Communicator >( aux ); + + synchronizer.synchronize( aux ); } #endif if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! @@ -384,9 +384,9 @@ solve( const MeshPointer& mesh, // PROTECTED FUNCTIONS: -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, const MeshPointer& mesh) @@ -406,11 +406,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -bool +bool FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: -goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, MeshFunctionType& aux, const InterfaceMapType& interfaceMap, const AnisotropyPointer& anisotropy ) { @@ -418,10 +418,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, const MeshType& mesh = aux.getMesh(); const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; const IndexType stepY = boundsFrom[1] < boundsTo[1]? 
1 : -1; - + typename MeshType::Cell cell( mesh ); cell.refresh(); - + for( cell.getCoordinates().y() = boundsFrom[1]; TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; cell.getCoordinates().y() += stepY ) @@ -444,54 +444,54 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, #ifdef HAVE_MPI -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); - + int calculateFromNeighbours[4] = {0,0,0,0}; const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh MPI::Request *requestsInformation; - requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + int neighCount = 0; // should this thread calculate again? 
- + if( neighbours[0] != -1 ) // LEFT { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } - + if( neighbours[1] != -1 ) // RIGHT { requestsInformation[neighCount++] = - MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); } - + if( neighbours[2] != -1 ) //UP { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); } - + if( neighbours[5] != -1 ) //DOWN { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); } MPI::WaitAll( requestsInformation, neighCount ); - + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || calculateFromNeighbours[2] || calculateFromNeighbours[3]; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index add4d9610..9468ff1db 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod2D_impl.h * Author: oberhuber * @@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisot FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, @@ -36,7 +36,7 @@ const Index& FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, @@ -48,7 +48,7 @@ void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, @@ -67,46 +67,46 @@ solve( const MeshPointer& mesh, InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps; setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); - + std::cout << "Initiating the interface cells ..." 
<< std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - auxPtr->save( "aux-ini.tnl" ); - + auxPtr->save( "aux-ini.tnl" ); + typename MeshType::Cell cell( *mesh ); - + IndexType iteration( 0 ); MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() ); - synchronizer.template synchronize< Communicator >( aux ); //synchronization of intial conditions - + synchronizer.synchronize( aux ); //synchronization of intial conditions + while( iteration < this->maxIterations ) { - // indicates weather we calculated in the last passage of the while cycle - // calculatedBefore is same for all ranks + // indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body - int calculatedBefore = 1; - + int calculatedBefore = 1; + // indicates if the MPI process should calculate again in upcoming passage of cycle // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body - int calculateMPIAgain = 1; - + int calculateMPIAgain = 1; + while( calculatedBefore ) { calculatedBefore = 0; - + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? { calculateMPIAgain = 0; - + /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ /*int numThreadsPerBlock = -1; - + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); if( numThreadsPerBlock <= 16 ) @@ -124,26 +124,26 @@ solve( const MeshPointer& mesh, else numThreadsPerBlock = 1024; //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - + if( numThreadsPerBlock == -1 ){ printf("Fail in setting numThreadsPerBlock.\n"); break; } - + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0); //std::cout << "numBlocksX = " << numBlocksX << std::endl; - + //Real **sArray = new Real*[numBlocksX*numBlocksY]; // for( int i = 0; i < numBlocksX * numBlocksY; i++ ) // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; - + ArrayContainer BlockIterHost; BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterHost.setValue( 1 ); int IsCalculationDone = 1; - + MeshFunctionPointer helpFunc( mesh ); MeshFunctionPointer helpFunc1( mesh ); helpFunc1 = auxPtr; @@ -159,7 +159,7 @@ solve( const MeshPointer& mesh, // std::cout<template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); } - //Reduction + //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; @@ -188,10 +188,10 @@ solve( const MeshPointer& mesh, } } numWhile++; - - + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); - + //string s( "aux-"+ std::to_string(numWhile) + ".tnl"); //aux.save( s ); } @@ -200,60 +200,60 @@ solve( const MeshPointer& mesh, } aux = *auxPtr;*/ /**------------------------------------------------------------------------------*/ - - + + /** HERE IS FSM WITH MPI AND WITHOUT 
MPI */ StaticVector boundsFrom; StaticVector boundsTo; - - // TOP, NORTH and EAST + + // TOP, NORTH and EAST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, NORTH and WEST + + // TOP, NORTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, SOUTH and EAST + + // TOP, SOUTH and EAST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, SOUTH and WEST + + // TOP, SOUTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, NOTH and EAST + + // BOTTOM, NOTH and EAST boundsFrom[2] 
= mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, NOTH and WEST + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, NOTH and WEST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, SOUTH and EAST + + // BOTTOM, SOUTH and EAST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, SOUTH and WEST + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, SOUTH and WEST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - + 
goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + /**----------------------------------------------------------------------------------*/ } if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) @@ -263,50 +263,50 @@ solve( const MeshPointer& mesh, // the number should be less than 10^3 (num of threads in one grid is maximally 1024) // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 8 ); - + // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps) int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); - int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); + int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - + // Making the variables for global function CudaUpdateCellCaller. dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - + BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller - - + + int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again - + // Array containing information about each block in grid, answering question (Have we calculated in this block?) 
TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); // calculate all in the first passage - + // Helping Array for GetNeighbours3D TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ ); BlockIterPom.setValue( 0 ); //doesnt matter what number - - - + + + // number of neighbours in one block (1024 threads) for GetNeighbours3D int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); - - - //MeshFunctionPointer helpFunc1( mesh ); + + + //MeshFunctionPointer helpFunc1( mesh ); Containers::Vector< RealType, DeviceType, IndexType > helpVec; helpVec.setLike( auxPtr.template getData().getData() ); MeshFunctionPointer helpFunc; helpFunc->bind( mesh, helpVec ); helpFunc.template modifyData() = auxPtr.template getData(); Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - + int numIter = 0; // number of passages of following while cycle - + while( BlockIterD ) //main body of cuda code { - + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); // main function that calculates all values in each blocks // calculated values are in helpFunc @@ -319,7 +319,7 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr auxPtr.swap( helpFunc ); - + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); // Neighbours of blocks that calculatedBefore in this passage should calculate in the next! 
// BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours) @@ -328,23 +328,23 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - + // .containsValue(1) is actually parallel reduction implemented in TNL BlockIterD = BlockIterDevice.containsValue(1); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + numIter++; - if( BlockIterD ){ + if( BlockIterD ){ // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization calculatedBefore = 1; } } if( numIter%2 == 1 ){ - + // We need auxPtr to point on memory of original auxPtr (not to helpFunc) // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values - auxPtr.swap( helpFunc ); + auxPtr.swap( helpFunc ); Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); } cudaDeviceSynchronize(); @@ -353,35 +353,35 @@ solve( const MeshPointer& mesh, interfaceMap = *interfaceMapPtr; #endif } - + #ifdef HAVE_MPI if( CommunicatorType::isDistributed() ) { getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); - // synchronizate the overlaps - synchronizer.template synchronize< Communicator >( aux ); + // synchronizate the overlaps + synchronizer.synchronize( aux ); } #endif - + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0! calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly } //aux.save( "aux-8.tnl" ); iteration++; - + } // Saving the results into Aux for MakeSnapshot function. 
- Aux = auxPtr; + Aux = auxPtr; aux.save("aux-final.tnl"); } // PROTECTED FUNCTIONS: -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, const MeshPointer& mesh) @@ -402,11 +402,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -bool +bool FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: -goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, MeshFunctionType& aux, const InterfaceMapType& interfaceMap, const AnisotropyPointer& anisotropy ) { @@ -415,10 +415,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 
1 : -1; - + typename MeshType::Cell cell( mesh ); cell.refresh(); - + for( cell.getCoordinates().z() = boundsFrom[2]; TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0; cell.getCoordinates().z() += stepZ ) @@ -446,72 +446,72 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, #ifdef HAVE_MPI -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); - + int calculateFromNeighbours[6] = {0,0,0,0,0,0}; - + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh MPI::Request *requestsInformation; - requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + int neighCount = 0; // should this thread calculate again? 
- + if( neighbours[0] != -1 ) // WEST { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } - + if( neighbours[1] != -1 ) // EAST { requestsInformation[neighCount++] = - MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); } - + if( neighbours[2] != -1 ) //NORTH { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); } - + if( neighbours[5] != -1 ) //SOUTH { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); } - - if( neighbours[8] != -1 ) // TOP + + if( neighbours[8] != -1 ) // TOP { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup ); } - + if( neighbours[17] != -1 ) //BOTTOM { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup 
); } - + MPI::WaitAll( requestsInformation, neighCount ); - + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || calculateFromNeighbours[2] || calculateFromNeighbours[3] || diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h index 3cc0af53a..b9ec101cf 100644 --- a/src/TNL/Functions/CutMeshFunction.h +++ b/src/TNL/Functions/CutMeshFunction.h @@ -14,9 +14,8 @@ #include namespace TNL { -namespace Functions { -template < typename CommunicatorType, - typename MeshFunctionType, +namespace Functions { +template < typename MeshFunctionType, typename OutMesh, typename OutDof, int outDimension=OutMesh::getMeshDimension(), @@ -25,10 +24,10 @@ class CutMeshFunction { public: static bool Cut(MeshFunctionType &inputMeshFunction, - OutMesh &outMesh, + OutMesh &outMesh, OutDof &outData, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, + Containers::StaticVector savedDimensions, + Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs ) { bool inCut; @@ -44,7 +43,7 @@ class CutMeshFunction auto toDistributedGrid=outMesh.getDistributedMesh(); TNL_ASSERT_TRUE(toDistributedGrid!=nullptr,"You are trying cut distributed meshfunction, but output grid is not set up for distribution"); - inCut=toDistributedGrid-> template SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs); + inCut=toDistributedGrid->SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs); if(inCut) { toDistributedGrid->setupGrid(outMesh); @@ -56,7 +55,7 @@ class CutMeshFunction { typename OutMesh::PointType outOrigin; typename OutMesh::PointType outProportions; - typename OutMesh::CoordinatesType outDimensions; + typename OutMesh::CoordinatesType outDimensions; for(int i=0; i - #include #include #include @@ -20,7 +18,7 @@ namespace TNL { -namespace Meshes { 
+namespace Meshes { namespace DistributedMeshes { @@ -28,7 +26,7 @@ namespace DistributedMeshes { template< int Dimension, typename Real, typename Device, - typename Index > + typename Index > class DistributedMesh< Grid< Dimension, Real, Device, Index > > { public: @@ -41,44 +39,43 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > typedef Containers::StaticVector< Dimension, IndexType > CoordinatesType; typedef Containers::StaticVector< Dimension, IndexType > SubdomainOverlapsType; - static constexpr int getMeshDimension() { return Dimension; }; + static constexpr int getMeshDimension() { return Dimension; }; - static constexpr int getNeighborsCount() { return DirectionCount::get(); } //c++14 may use Directions::pow3(Dimension)-1 + static constexpr int getNeighborsCount() { return DirectionCount::get(); } //c++14 may use Directions::pow3(Dimension)-1 DistributedMesh(); ~DistributedMesh(); - + static void configSetup( Config::ConfigDescription& config ); - + bool setup( const Config::ParameterContainer& parameters, - const String& prefix ); - + const String& prefix ); + void setDomainDecomposition( const CoordinatesType& domainDecomposition ); - + const CoordinatesType& getDomainDecomposition() const; - - template< typename CommunicatorType > + void setGlobalGrid( const GridType& globalGrid ); - + const GridType& getGlobalGrid() const; - + void setOverlaps( const SubdomainOverlapsType& lower, const SubdomainOverlapsType& upper); - + void setupGrid( GridType& grid); bool isDistributed() const; - + bool isBoundarySubdomain() const; - + // TODO: replace it with getLowerOverlap() and getUpperOverlap() // It is still being used in cuts set-up const CoordinatesType& getOverlap() const { return this->overlap;}; - + //currently used overlaps at this subdomain const SubdomainOverlapsType& getLowerOverlap() const; - + const SubdomainOverlapsType& getUpperOverlap() const; //number of elements of local sub domain WITHOUT overlap @@ -95,7 +92,7 @@ class 
DistributedMesh< Grid< Dimension, Real, Device, Index > > //number of elements of local sub domain WITH overlap // TODO: replace with localGrid const CoordinatesType& getLocalGridSize() const; - + //coordinates of begin of local subdomain without overlaps in local grid const CoordinatesType& getLocalBegin() const; @@ -104,40 +101,40 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > const PointType& getLocalOrigin() const; const PointType& getSpaceSteps() const; - //aka MPI-communcicator - void setCommunicationGroup(void * group); - void * getCommunicationGroup() const; + //aka MPI-communcicator + void setCommunicationGroup(MPI_Comm group); + MPI_Comm getCommunicationGroup() const; template< int EntityDimension > IndexType getEntitiesCount() const; template< typename Entity > - IndexType getEntitiesCount() const; + IndexType getEntitiesCount() const; const int* getNeighbors() const; - - const int* getPeriodicNeighbors() const; - template - bool SetupByCut(DistributedGridType &inputDistributedGrid, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, + const int* getPeriodicNeighbors() const; + + template + bool SetupByCut(DistributedGridType &inputDistributedGrid, + Containers::StaticVector savedDimensions, + Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs); int getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const; - + String printProcessCoords() const; String printProcessDistr() const; - + void writeProlog( Logger& logger ); - public: - + public: + bool isThereNeighbor(const CoordinatesType &direction) const; void setupNeighbors(); - + void print( std::ostream& str ) const; GridType globalGrid; @@ -149,26 +146,26 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > //CoordinatesType globalDimensions; CoordinatesType globalBegin; PointType spaceSteps; - + SubdomainOverlapsType lowerOverlap, upperOverlap, globalLowerOverlap, globalUpperOverlap; 
CoordinatesType domainDecomposition; - CoordinatesType subdomainCoordinates; + CoordinatesType subdomainCoordinates; // TODO: static arrays int neighbors[ getNeighborsCount() ]; int periodicNeighbors[ getNeighborsCount() ]; - IndexType Dimensions; + IndexType Dimensions; bool distributed; - + int rank; int nproc; bool isSet; - //aka MPI-communicator - void * communicationGroup; + //aka MPI-communicator + MPI_Comm group; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp index a35b53962..c48fec9af 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp @@ -11,9 +11,9 @@ #pragma once #include -#include #include "DistributedGrid.h" +#include namespace TNL { namespace Meshes { @@ -28,8 +28,6 @@ template DistributedMesh< Grid< Dimension, Real, Device, Index > >:: ~DistributedMesh() { - if(isSet && this->communicationGroup!=nullptr) - std::free(this->communicationGroup); } @@ -57,7 +55,7 @@ setup( const Config::ParameterContainer& parameters, return true; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setDomainDecomposition( const CoordinatesType& domainDecomposition ) @@ -65,7 +63,7 @@ setDomainDecomposition( const CoordinatesType& domainDecomposition ) this->domainDecomposition = domainDecomposition; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getDomainDecomposition() const @@ -73,18 +71,12 @@ getDomainDecomposition() const return this->domainDecomposition; } -template< int Dimension, typename Real, typename 
Device, typename Index > -template< typename CommunicatorType > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setGlobalGrid( const GridType &globalGrid ) { - if(this->isSet && this->communicationGroup != nullptr) - std::free(this->communicationGroup); - this->communicationGroup= std::malloc(sizeof(typename CommunicatorType::CommunicationGroup)); - - *((typename CommunicatorType::CommunicationGroup *)this->communicationGroup) = CommunicatorType::AllGroup; - auto group=*((typename CommunicatorType::CommunicationGroup *)this->communicationGroup); + this->group = MPI::AllGroup(); this->globalGrid = globalGrid; this->isSet=true; @@ -99,15 +91,12 @@ setGlobalGrid( const GridType &globalGrid ) this->spaceSteps=globalGrid.getSpaceSteps(); this->distributed=false; - if( CommunicatorType::IsInitialized() ) + this->rank=MPI::GetRank(group); + this->nproc=MPI::GetSize(group); + //use MPI only if have more than one process + if(this->nproc>1) { - this->rank=CommunicatorType::GetRank(group); - this->nproc=CommunicatorType::GetSize(group); - //use MPI only if have more than one process - if(this->nproc>1) - { - this->distributed=true; - } + this->distributed=true; } if( !this->distributed ) @@ -127,10 +116,8 @@ setGlobalGrid( const GridType &globalGrid ) //compute node distribution int dims[ Dimension ]; for( int i = 0; i < Dimension; i++ ) - dims[ i ]= this->domainDecomposition[ i ]; - - - CommunicatorType::DimsCreate( this->nproc, Dimension, dims ); + dims[ i ] = this->domainDecomposition[ i ]; + MPI::Compute_dims( this->nproc, Dimension, dims ); for( int i = 0; i < Dimension; i++ ) this->domainDecomposition[ i ] = dims[ i ]; @@ -146,16 +133,16 @@ setGlobalGrid( const GridType &globalGrid ) for( int i = 0; i < Dimension; i++ ) { numberOfLarger[ i ] = globalGrid.getDimensions()[ i ] % this->domainDecomposition[ i ]; - + this->localSize[ i ] = globalGrid.getDimensions()[ i ] / 
this->domainDecomposition[ i ]; - + if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] ) this->localSize[ i ] += 1; - + if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] ) this->globalBegin[ i ] = this->subdomainCoordinates[ i ] * this->localSize[ i ]; else - this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + + this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + ( this->subdomainCoordinates[ i ] - numberOfLarger[ i ] ) * this->localSize[ i ]; } @@ -164,7 +151,7 @@ setGlobalGrid( const GridType &globalGrid ) } } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setOverlaps( const SubdomainOverlapsType& lower, @@ -191,7 +178,7 @@ setupGrid( GridType& grid) grid.setDistMesh(this); }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getSubdomainCoordinates() const @@ -199,7 +186,7 @@ getSubdomainCoordinates() const return this->subdomainCoordinates; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalOrigin() const @@ -207,15 +194,15 @@ getLocalOrigin() const return this->localOrigin; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType& DistributedMesh< Grid< 
Dimension, Real, Device, Index > >:: getSpaceSteps() const { return this->spaceSteps; } - -template< int Dimension, typename Real, typename Device, typename Index > + +template< int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isDistributed() const @@ -223,7 +210,7 @@ isDistributed() const return this->distributed; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isBoundarySubdomain() const @@ -234,7 +221,7 @@ isBoundarySubdomain() const return false; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLowerOverlap() const @@ -242,7 +229,7 @@ getLowerOverlap() const return this->lowerOverlap; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getUpperOverlap() const @@ -250,7 +237,7 @@ getUpperOverlap() const return this->upperOverlap; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalSize() const @@ -258,7 +245,7 @@ getLocalSize() const return this->localSize; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, 
typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalSize() const @@ -266,7 +253,7 @@ getGlobalSize() const return this->globalGrid.getDimensions(); } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::GridType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalGrid() const @@ -274,7 +261,7 @@ getGlobalGrid() const return this->globalGrid; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalBegin() const @@ -282,7 +269,7 @@ getGlobalBegin() const return this->globalBegin; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalGridSize() const @@ -290,7 +277,7 @@ getLocalGridSize() const return this->localGridSize; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalBegin() const @@ -298,7 +285,7 @@ getLocalBegin() const return this->localBegin; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, 
typename Device, typename Index > template< int EntityDimension > Index DistributedMesh< Grid< Dimension, Real, Device, Index > >:: @@ -307,7 +294,7 @@ getEntitiesCount() const return this->globalGrid. template getEntitiesCount< EntityDimension >(); } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > template< typename Entity > Index DistributedMesh< Grid< Dimension, Real, Device, Index > >:: @@ -316,23 +303,23 @@ getEntitiesCount() const return this->globalGrid. template getEntitiesCount< Entity >(); } -template< int Dimension, typename Real, typename Device, typename Index > -void +template< int Dimension, typename Real, typename Device, typename Index > +void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: -setCommunicationGroup(void * group) +setCommunicationGroup(MPI_Comm group) { - this->communicationGroup=group; + this->group=group; } -template< int Dimension, typename Real, typename Device, typename Index > -void * +template< int Dimension, typename Real, typename Device, typename Index > +MPI_Comm DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getCommunicationGroup() const { - return this->communicationGroup; + return this->group; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > int DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const @@ -347,7 +334,7 @@ getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const return ret; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isThereNeighbor(const CoordinatesType &direction) const @@ -365,7 +352,7 @@ isThereNeighbor(const CoordinatesType 
&direction) const } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setupNeighbors() @@ -378,7 +365,7 @@ setupNeighbors() this->neighbors[ i ] = this->getRankOfProcCoord( coordinates ); else this->neighbors[ i ] = -1; - + // Handling periodic neighbors for( int d = 0; d < Dimension; d++ ) { @@ -388,12 +375,12 @@ setupNeighbors() coordinates[ d ] = 0; this->periodicNeighbors[ i ] = this->getRankOfProcCoord( coordinates ); } - + //std::cout << "Setting i-th neighbour to " << neighbors[ i ] << " and " << periodicNeighbors[ i ] << std::endl; } } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const int* DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getNeighbors() const @@ -402,7 +389,7 @@ getNeighbors() const return this->neighbors; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const int* DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getPeriodicNeighbors() const @@ -412,12 +399,12 @@ getPeriodicNeighbors() const } template< int Dimension, typename Real, typename Device, typename Index > - template -bool + template +bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: -SetupByCut(DistributedGridType &inputDistributedGrid, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, +SetupByCut(DistributedGridType &inputDistributedGrid, + Containers::StaticVector savedDimensions, + Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs) { @@ -432,21 +419,17 @@ SetupByCut(DistributedGridType &inputDistributedGrid, } //create new group with used nodes - typename CommunicatorType::CommunicationGroup 
*oldGroup=(typename CommunicatorType::CommunicationGroup *)(inputDistributedGrid.getCommunicationGroup()); - if(this->isSet && this->communicationGroup != nullptr) - free(this->communicationGroup); - this->communicationGroup = std::malloc(sizeof(typename CommunicatorType::CommunicationGroup)); - + const MPI_Comm oldGroup=inputDistributedGrid.getCommunicationGroup(); if(isInCut) { this->isSet=true; - + auto fromGlobalMesh=inputDistributedGrid.getGlobalGrid(); //set global grid typename GridType::PointType outOrigin; typename GridType::PointType outProportions; typename GridType::CoordinatesType outDimensions; - + for(int i=0; ispaceSteps[i]=inputDistributedGrid.getSpaceSteps()[savedDimensions[i]]; } - int newRank= getRankOfProcCoord(this->subdomainCoordinates); - - CommunicatorType::CreateNewGroup(isInCut,newRank,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup)); + int newRank = getRankOfProcCoord(this->subdomainCoordinates); + this->group = MPI::Comm_split( oldGroup, 1, newRank ); setupNeighbors(); - + bool isDistributed=false; for(int i=0;idistributed=isDistributed; - + this->globalGrid.setDimensions(outDimensions); this->globalGrid.setDomain(outOrigin,outProportions); @@ -491,7 +473,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid, } else { - CommunicatorType::CreateNewGroup(isInCut,0,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup)); + this->group = MPI::Comm_split( oldGroup, MPI_UNDEFINED, 0 ); } return false; @@ -517,7 +499,7 @@ printProcessDistr() const for(int i=1; idomainDecomposition[i]); return res; -}; +}; template< int Dimension, typename Real, typename Device, typename Index > void @@ -525,19 +507,18 @@ DistributedMesh< Grid< Dimension, Real, Device, Index > >:: writeProlog( Logger& logger ) { logger.writeParameter( "Domain decomposition:", this->getDomainDecomposition() ); -} +} -template< int Dimension, typename Real, typename Device, typename Index > +template< 
int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: print( std::ostream& str ) const { - using Communicator = Communicators::MpiCommunicator; - for( int j = 0; j < Communicator::GetSize( Communicator::AllGroup ); j++ ) + for( int j = 0; j < MPI::GetSize(); j++ ) { - if( j == Communicator::GetRank( Communicator::AllGroup ) ) + if( j == MPI::GetRank() ) { - str << "Node : " << Communicator::GetRank( Communicator::AllGroup ) << std::endl + str << "Node : " << MPI::GetRank() << std::endl << " localOrigin : " << localOrigin << std::endl << " localBegin : " << localBegin << std::endl << " localSize : " << localSize << std::endl @@ -558,7 +539,7 @@ print( std::ostream& str ) const str << " " << periodicNeighbors[ i ]; str << std::endl; } - Communicator::Barrier( Communicator::AllGroup ); + MPI::Barrier(); } } diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h index 38a7c04f0..edb08baf7 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -21,11 +20,11 @@ #include namespace TNL { -namespace Meshes { +namespace Meshes { namespace DistributedMeshes { enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1, MpiIO=2 }; - + template< typename MeshFunction, DistrGridIOTypes type = LocalCopy, typename Mesh = typename MeshFunction::MeshType, @@ -34,7 +33,7 @@ class DistributedGridIO { }; -template< typename MeshFunctionType > +template< typename MeshFunctionType > class DistributedGridIO< MeshFunctionType, Dummy > { bool save(const String& fileName, MeshFunctionType &meshFunction) diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h index 99f505bba..698d7e41d 100644 --- 
a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h @@ -159,7 +159,7 @@ class DistributedGridIO_MPIIOBase meshFunction.save(fileName); } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_File file; int ok=MPI_File_open( group, @@ -182,7 +182,7 @@ class DistributedGridIO_MPIIOBase { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_Datatype ftype; MPI_Datatype atype; int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); @@ -191,7 +191,7 @@ class DistributedGridIO_MPIIOBase MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL); - if(Communicators::MpiCommunicator::GetRank(group)==0) + if(MPI::GetRank(group)==0) { MPI_File_seek(file,offset,MPI_SEEK_SET); headerSize=writeMeshFunctionHeader(file,meshFunction,dataCount); @@ -334,7 +334,7 @@ class DistributedGridIO_MPIIOBase return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_File file; if( MPI_File_open( group, @@ -357,7 +357,7 @@ class DistributedGridIO_MPIIOBase { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_Datatype ftype; MPI_Datatype atype; int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); @@ -366,7 +366,7 @@ class DistributedGridIO_MPIIOBase int headerSize=0; - if(Communicators::MpiCommunicator::GetRank(group)==0) + if(MPI::GetRank(group)==0) { MPI_File_seek(file,offset,MPI_SEEK_SET); headerSize=readMeshFunctionHeader(file,meshFunction,dataCount); @@ -443,7 +443,7 @@ class DistributedGridIO< static bool save(const String& fileName, MeshFunctionType &meshFunction) { #ifdef 
HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { using HostVectorType = Containers::Vector; HostVectorType hostVector; @@ -452,14 +452,14 @@ class DistributedGridIO< return DistributedGridIO_MPIIOBase::save(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { using HostVectorType = Containers::Vector; HostVectorType hostVector; @@ -470,7 +470,7 @@ class DistributedGridIO< return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; }; @@ -492,26 +492,26 @@ class DistributedGridIO< static bool save(const String& fileName, MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { typename MeshFunctionType::RealType* data=meshFunction.getData().getData(); return DistributedGridIO_MPIIOBase::save(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. 
- isUsed { typename MeshFunctionType::RealType* data = meshFunction.getData().getData(); return DistributedGridIO_MPIIOBase::load(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h index 52217c336..8febf3c72 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h @@ -49,7 +49,7 @@ class DistributedGridIO_VectorField< static bool save(const String& fileName, Functions::VectorField< Size, MeshFunctionType > &vectorField) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { auto *distrGrid=vectorField.getMesh().getDistributedMesh(); if(distrGrid==NULL) @@ -58,9 +58,9 @@ class DistributedGridIO_VectorField< return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); - //write + //write MPI_File file; MPI_File_open( group, const_cast< char* >( fileName.getString() ), @@ -68,12 +68,12 @@ class DistributedGridIO_VectorField< MPI_INFO_NULL, &file); - - int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset - if(Communicators::MpiCommunicator::GetRank(group)==0) + + int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset + if(MPI::GetRank(group)==0) offset+=writeVectorFieldHeader(file,vectorField); MPI_Bcast(&offset, 1, MPI_INT,0, group); - + for( int i = 0; i < vectorField.getVectorDimension(); i++ ) { typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData(); //here manage data transfer Device... 
@@ -83,13 +83,13 @@ class DistributedGridIO_VectorField< return false; } - MPI_File_close(&file); - return true; + MPI_File_close(&file); + return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; - + }; #ifdef HAVE_MPI @@ -140,7 +140,7 @@ class DistributedGridIO_VectorField< static bool load(const String& fileName, Functions::VectorField &vectorField) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { auto *distrGrid=vectorField.getMesh().getDistributedMesh(); if(distrGrid==NULL) @@ -149,9 +149,9 @@ class DistributedGridIO_VectorField< return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); - //write + //write MPI_File file; MPI_File_open( group, const_cast< char* >( fileName.getString() ), @@ -159,12 +159,12 @@ class DistributedGridIO_VectorField< MPI_INFO_NULL, &file); - - int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset - if(Communicators::MpiCommunicator::GetRank(group)==0) + + int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset + if(MPI::GetRank(group)==0) offset+=readVectorFieldHeader(file,vectorField); MPI_Bcast(&offset, 1, MPI_INT,0, group); - + for( int i = 0; i < vectorField.getVectorDimension(); i++ ) { typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData(); //here manage data transfer Device... @@ -174,13 +174,13 @@ class DistributedGridIO_VectorField< return false; } - MPI_File_close(&file); - return true; + MPI_File_close(&file); + return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." 
<< std::endl; return false; - + }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h index 7bc17f920..ed68150a0 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h @@ -111,8 +111,7 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea } } - template< typename CommunicatorType, - typename MeshFunctionType, + template< typename MeshFunctionType, typename PeriodicBoundariesMaskPointer = Pointers::SharedPointer< MeshFunctionType > > void synchronize( MeshFunctionType &meshFunction, bool periodicBoundaries = false, @@ -144,9 +143,8 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea PeriodicBoundariesMaskPointer( nullptr ) ); // the mask is used only when receiving data ); //async send and receive - typename CommunicatorType::Request requests[2*this->getNeighborCount()]; - typename CommunicatorType::CommunicationGroup group; - group=*((typename CommunicatorType::CommunicationGroup *)(distributedGrid->getCommunicationGroup())); + MPI_Request requests[2*this->getNeighborCount()]; + MPI_Comm group = distributedGrid->getCommunicationGroup(); int requestsCount( 0 ); //send everything, recieve everything @@ -158,22 +156,22 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea if( neighbors[ i ] != -1 ) { //TNL_MPI_PRINT( "Sending data to node " << neighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); + requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); //TNL_MPI_PRINT( "Receiving data from node " << neighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast( recieveBuffers[ i 
].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); + requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); } else if( periodicBoundaries && sendSizes[ i ] !=0 ) { //TNL_MPI_PRINT( "Sending data to node " << periodicNeighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); + requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); //TNL_MPI_PRINT( "Receiving data to node " << periodicNeighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); + requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); } } //wait until send is done //TNL_MPI_PRINT( "Waiting for data ..." ) - CommunicatorType::WaitAll( requests, requestsCount ); + MPI::Waitall( requests, requestsCount ); //copy data from receive buffers //TNL_MPI_PRINT( "Copying data ..." 
) diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h index 851ff6627..b479544f7 100644 --- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h +++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h @@ -16,23 +16,21 @@ namespace TNL { namespace Meshes { namespace DistributedMeshes { - -template< typename Mesh, - typename Communicator > + +template< typename Mesh > class SubdomainOverlapsGetter {}; -// TODO: Specializations by the grid dimension can be avoided when the MPI directions are +// TODO: Specializations by the grid dimension can be avoided when the MPI directions are // rewritten in a dimension independent way template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > > { public: - + static const int Dimension = 1; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -40,10 +38,9 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1, */ @@ -53,18 +50,17 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > > { public: - + static const int Dimension = 2; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -72,10 +68,9 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * lower.y() is overlap of the subdomain at boundary where y = 0, * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1, @@ -87,17 +82,16 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > > { public: - + static const int Dimension = 3; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -105,10 +99,9 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * lower.y() is overlap of the subdomain at boundary where y = 0, * lower.z() is overlap of the subdomain at boundary where z = 0, @@ -122,7 +115,7 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp index 9dbb1372b..aa185e1ec 100644 --- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp +++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -19,26 +20,25 @@ namespace TNL { /* * TODO: This could work when the MPI directions are rewritten - + template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, IndexType subdomainOverlapSize, const SubdomainOverlapsType& periodicBoundariesOverlapSize ) { - if( ! CommunicatorType::isDistributed() ) + if( ! 
MPI::isDistributed() ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + for( int i = 0; i < Dimension; i++ ) { CoordinatesType neighborDirection( 0 ); @@ -47,7 +47,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, lower[ i ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ Directions::getDirection( neighborDirection ) ] != rank ) lower[ i ] = periodicBoundariesOverlapSize[ i ]; - + neighborDirection[ i ] = 1; if( subdomainCoordinates[ i ] < distributedMesh->getDomainDecomposition()[ i ] - 1 ) upper[ i ] = subdomainOverlapSize; @@ -55,15 +55,14 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ i ] = periodicBoundariesOverlapSize[ i ]; } } - + */ template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -71,13 +70,13 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -92,10 +91,9 @@ getOverlaps( const DistributedMeshType* distributedMesh, template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -103,15 +101,15 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + int rank = MPI::GetRank(); lower = 0; upper = 0; - + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -121,7 +119,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank ) upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ]; - + if( subdomainCoordinates[ 1 ] > 0 ) lower[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank ) @@ -135,10 +133,9 @@ getOverlaps( const DistributedMeshType* distributedMesh, template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -146,13 +143,13 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -162,7 +159,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank ) upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ]; - + if( subdomainCoordinates[ 1 ] > 0 ) lower[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank ) @@ -172,7 +169,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYpXz ] != rank ) upper[ 1 ] = upperPeriodicBoundariesOverlapSize[ 1 ]; - + if( subdomainCoordinates[ 2 ] > 0 ) lower[ 2 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZmYzXz ] != rank ) diff --git a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h index 135e3c15a..52c0b543b 100644 --- a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h +++ b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h @@ -94,8 +94,7 @@ resolveAndLoadDistributedMesh( Functor&& functor, return resolveDistributedMeshType< ConfigTag, Device >( wrapper, fileName, fileFormat ); } -template< typename CommunicatorType, - typename MeshConfig, +template< typename MeshConfig, typename Device > bool loadDistributedMesh( Mesh< MeshConfig, Device >& mesh, @@ -145,8 +144,7 @@ decomposeMesh( const Config::ParameterContainer& parameters, } // overloads for grids -template< typename CommunicatorType, - int Dimension, +template< int 
Dimension, typename Real, typename Device, typename Index > @@ -171,7 +169,7 @@ loadDistributedMesh( Grid< Dimension, Real, Device, Index >& mesh, std::cout << " [ OK ] " << std::endl; typename Meshes::DistributedMeshes::DistributedMesh>::SubdomainOverlapsType overlap; - distributedMesh.template setGlobalGrid< CommunicatorType >( globalGrid ); + distributedMesh.setGlobalGrid( globalGrid ); distributedMesh.setupGrid(mesh); return true; } @@ -191,7 +189,6 @@ decomposeMesh( const Config::ParameterContainer& parameters, using GridType = Grid< Dimension, Real, Device, Index >; using DistributedGridType = DistributedMeshes::DistributedMesh< GridType >; using SubdomainOverlapsType = typename DistributedGridType::SubdomainOverlapsType; - using CommunicatorType = typename Problem::CommunicatorType; SubdomainOverlapsType lower( 0 ), upper( 0 ); distributedMesh.setOverlaps( lower, upper ); diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h index 1da61c51e..131697afb 100644 --- a/src/TNL/Problems/HeatEquationProblem_impl.h +++ b/src/TNL/Problems/HeatEquationProblem_impl.h @@ -146,7 +146,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); synchronizer.setDistributedGrid( uPointer->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *uPointer ); + synchronizer.synchronize( *uPointer ); } else { @@ -173,7 +173,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > - template< typename MatrixPointer > + template< typename MatrixPointer > bool HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: setupLinearSystem( MatrixPointer& matrixPointer ) @@ -247,7 +247,7 @@ getExplicitUpdate( const RealType& time, * * You may use supporting vectors again if 
you need. */ - + this->bindDofs( uDofs ); this->fuPointer->bind( this->getMesh(), *fuDofs ); this->explicitUpdater.template update< typename Mesh::Cell, Communicator >( time, tau, this->getMesh(), this->uPointer, this->fuPointer ); @@ -258,7 +258,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > -void +void HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: applyBoundaryConditions( const RealType& time, DofVectorPointer& uDofs ) @@ -272,7 +272,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > - template< typename MatrixPointer > + template< typename MatrixPointer > void HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: assemblyLinearSystem( const RealType& time, @@ -282,7 +282,7 @@ assemblyLinearSystem( const RealType& time, DofVectorPointer& bPointer ) { this->bindDofs( dofsPointer ); - this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( + this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( time, tau, this->getMesh(), diff --git a/src/TNL/Problems/PDEProblem_impl.h b/src/TNL/Problems/PDEProblem_impl.h index 6a3aa63e6..f42f18b16 100644 --- a/src/TNL/Problems/PDEProblem_impl.h +++ b/src/TNL/Problems/PDEProblem_impl.h @@ -59,7 +59,7 @@ template< typename Mesh, typename PDEProblem< Mesh, Communicator, Real, Device, Index >::IndexType PDEProblem< Mesh, Communicator, Real, Device, Index >:: subdomainOverlapSize() -{ +{ return 1; } @@ -77,9 +77,9 @@ getSubdomainOverlaps( const Config::ParameterContainer& parameters, SubdomainOverlapsType& upper ) { using namespace Meshes::DistributedMeshes; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() ); + SubdomainOverlapsGetter< 
MeshType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() ); } - + template< typename Mesh, typename Communicator, typename Real, diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h index 0c605fb95..34f2798f8 100644 --- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h @@ -63,7 +63,7 @@ setup( const Config::ParameterContainer& parameters, const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" ); this->distributedMesh.setup( parameters, prefix ); if( Problem::CommunicatorType::isDistributed() ) { - if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) + if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) return false; if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) ) return false; diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h index 5292e7f41..880d0ab31 100644 --- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h @@ -54,7 +54,7 @@ setup( const Config::ParameterContainer& parameters, const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" ); this->distributedMesh.setup( parameters, prefix ); if( Problem::CommunicatorType::isDistributed() ) { - if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) + if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) return false; if( ! 
Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) ) return false; diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h index fa1d23951..dbecdaad9 100644 --- a/src/TNL/Solvers/SolverStarter_impl.h +++ b/src/TNL/Solvers/SolverStarter_impl.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -24,14 +24,14 @@ #include namespace TNL { -namespace Solvers { +namespace Solvers { template< typename Problem, typename ConfigTag, bool TimeDependent = Problem::isTimeDependent() > class TimeDependencyResolver {}; - + template< typename Problem, typename ConfigTag, typename TimeStepper = typename Problem::TimeStepper > @@ -65,7 +65,7 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame */ if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! Communicators::MpiCommunicator::setup( parameters ) + ! MPI::setup( parameters ) ) return false; Problem problem; @@ -93,7 +93,7 @@ class TimeDependencyResolver< Problem, ConfigTag, false > const Config::ParameterContainer& parameters ) { // TODO: This should be improved - at least rename to LinearSolverSetter - return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters ); + return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters ); } }; @@ -336,7 +336,7 @@ bool SolverStarter< ConfigTag > :: runPDESolver( Problem& problem, */ this->computeTimer.reset(); this->ioTimer.reset(); - + /**** * Create solver monitor thread */ diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h index 5c35c7c33..bc1f43c77 100644 --- a/src/TNL/Solvers/Solver_impl.h +++ b/src/TNL/Solvers/Solver_impl.h @@ -15,8 +15,8 @@ #include #include #include -#include #include +#include namespace TNL { namespace 
Solvers { @@ -35,7 +35,7 @@ run( int argc, char* argv[] ) configDescription.addDelimiter( "Parallelization setup:" ); Devices::Host::configSetup( configDescription ); Devices::Cuda::configSetup( configDescription ); - Communicators::MpiCommunicator::configSetup( configDescription ); + MPI::configSetup( configDescription ); TNL::MPI::ScopedInitializer mpi( argc, argv ); diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp index 73765aafb..a1b3a8ff3 100644 --- a/src/Tools/tnl-init.cpp +++ b/src/Tools/tnl-init.cpp @@ -15,8 +15,8 @@ #include #include -#include #include +#include using namespace TNL; @@ -53,7 +53,7 @@ int main( int argc, char* argv[] ) Config::ConfigDescription configDescription; setupConfig( configDescription ); - Communicators::MpiCommunicator::configSetup( configDescription ); + TNL::MPI::configSetup( configDescription ); TNL::MPI::ScopedInitializer mpi(argc, argv); diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h index 8a4024ac6..e78db1153 100644 --- a/src/Tools/tnl-init.h +++ b/src/Tools/tnl-init.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -21,8 +22,6 @@ #include #include -#include - using namespace TNL; template< typename MeshType, @@ -32,25 +31,23 @@ template< typename MeshType, int zDiff > bool renderFunction( const Config::ParameterContainer& parameters ) { - using CommunicatorType = Communicators::MpiCommunicator; - using namespace Meshes::DistributedMeshes; using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh; DistributedGridType distributedMesh; Pointers::SharedPointer< MeshType > meshPointer; MeshType globalMesh; - if(CommunicatorType::isDistributed()) + if(TNL::MPI::GetSize() > 1) { //suppose global mesh loaded from single file String meshFile = parameters.getParameter< String >( "mesh" ); std::cout << "+ -> Loading mesh from " << meshFile << " ... 
" << std::endl; globalMesh.load( meshFile ); - + // TODO: This should work with no overlaps - distributedMesh.template setGlobalGrid(globalMesh); + distributedMesh.setGlobalGrid(globalMesh); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 ); distributedMesh.setOverlaps( lowerOverlap, upperOverlap ); distributedMesh.setupGrid(*meshPointer); } @@ -73,7 +70,7 @@ bool renderFunction( const Config::ParameterContainer& parameters ) MeshFunctionPointer meshFunction( meshPointer ); //if( ! discreteFunction.setSize( mesh.template getEntitiesCount< typename MeshType::Cell >() ) ) // return false; - + double finalTime = parameters.getParameter< double >( "final-time" ); double initialTime = parameters.getParameter< double >( "initial-time" ); double tau = parameters.getParameter< double >( "snapshot-period" ); @@ -115,7 +112,7 @@ bool renderFunction( const Config::ParameterContainer& parameters ) else std::cout << "+ -> Writing the function to " << outputFile << " ... " << std::endl; - if(CommunicatorType::isDistributed()) + if(TNL::MPI::GetSize() > 1) { if( ! 
Meshes::DistributedMeshes::DistributedGridIO ::save(outputFile, *meshFunction ) ) return false; diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp index 54071032c..dccd68f23 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp @@ -1,9 +1,8 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include @@ -12,30 +11,25 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Devices; -using namespace TNL::Communicators; -typedef MpiCommunicator CommunicatorType; - -template< - typename MeshType, - typename CommunicatorType> +template< typename MeshType > void SetUpDistributedGrid(DistributedMesh &distributedGrid, MeshType &globalGrid,int size,typename MeshType::CoordinatesType distribution ) { typename MeshType::PointType globalOrigin; typename MeshType::PointType globalProportions; using DistributedMeshType = DistributedMesh< MeshType >; - + globalOrigin.setValue( -0.5 ); globalProportions.setValue( size ); globalGrid.setDimensions( size ); globalGrid.setDomain( globalOrigin,globalProportions ); - + distributedGrid.setDomainDecomposition( distribution ); - distributedGrid.template setGlobalGrid(globalGrid); + distributedGrid.setGlobalGrid(globalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); } @@ -44,47 +38,47 @@ void SetUpDistributedGrid(DistributedMesh &distributedGrid, MeshType & TEST(CutDistributedGirdTest_2D, IsInCut) { typedef 
Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), StaticVector<1,int>(5) ); - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%3==1) + if(TNL::MPI::GetRank()%3==1) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_2D, GloblaGridDimesion) { typedef Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -92,24 +86,24 @@ TEST(CutDistributedGirdTest_2D, GloblaGridDimesion) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + 
EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; } } TEST(CutDistributedGirdTest_2D, IsDistributed) { typedef Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -123,17 +117,17 @@ TEST(CutDistributedGirdTest_2D, IsDistributed) TEST(CutDistributedGirdTest_2D, IsNotDistributed) { typedef Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(12,1)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(12,1)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -149,47 +143,47 @@ TEST(CutDistributedGirdTest_2D, IsNotDistributed) TEST(CutDistributedGirdTest_3D, IsInCut_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh 
DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), StaticVector<2,int>(2,2) ); - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%4==0) + if(TNL::MPI::GetRank()%4==0) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -197,24 +191,24 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; } } TEST(CutDistributedGirdTest_3D, IsDistributed_1D) { typedef 
Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -228,17 +222,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_1D) TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1)); + SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -254,48 +248,48 @@ TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D) TEST(CutDistributedGirdTest_3D, IsInCut_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType 
distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), StaticVector<1,int>(5) ); - int rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); + int rank=TNL::MPI::GetRank(); if(rank>3 && rank<8) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), @@ -303,7 +297,7 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),2) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().y(),10) << "Rozměry globálního gridu neodpovídají"; } } @@ -311,17 +305,17 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) 
TEST(CutDistributedGirdTest_3D, IsDistributed_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), @@ -335,17 +329,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_2D) TEST(CutDistributedGirdTest_3D, IsNotDistributed_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12)); + SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp index 4d5bb4baf..9ad46b412 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp @@ -6,7 +6,6 @@ #include #include #include -#include 
#include #include #include @@ -18,9 +17,6 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Devices; -using namespace TNL::Communicators; - -typedef MpiCommunicator CommunicatorType; static const char* TEST_FILE_NAME = "test_CutDistributedMeshFunctionTest.tnl"; @@ -52,9 +48,9 @@ TEST(CutDistributedMeshFunction, 2D_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); @@ -73,14 +69,14 @@ TEST(CutDistributedMeshFunction, 2D_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -134,9 +130,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename 
DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -154,14 +150,14 @@ TEST(CutDistributedMeshFunction, 3D_1_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(2), StaticVector<2,int>(1,0), @@ -215,9 +211,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -235,14 +231,14 @@ TEST(CutDistributedMeshFunction, 3D_2_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( 
*meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -302,9 +298,9 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -325,7 +321,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -338,7 +334,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -387,9 +383,9 @@ TEST(CutDistributedMeshFunction, 
3D_1_Synchronization) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid( globalOriginalGrid ); + distributedGrid.setGlobalGrid( globalOriginalGrid ); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -410,7 +406,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<2,int>(0,2), @@ -423,7 +419,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -476,9 +472,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) overlap.setValue(1); DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 
1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -499,7 +495,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -512,7 +508,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -563,9 +559,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) overlap.setValue(1); DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid( globalOriginalGrid ); + distributedGrid.setGlobalGrid( globalOriginalGrid ); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -586,7 +582,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( 
*meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -600,9 +596,8 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) DistributedGridIO,MpiIO> ::save(TEST_FILE_NAME, cutMeshFunction ); //save globalgrid for debug render - typename CommunicatorType::CommunicationGroup *group; - group=(typename CommunicatorType::CommunicationGroup *)(cutDistributedGrid.getCommunicationGroup()); - if(CommunicatorType::GetRank(*group)==0) + MPI_Comm group=cutDistributedGrid.getCommunicationGroup(); + if(TNL::MPI::GetRank(group)==0) { File meshFile; meshFile.open( TEST_FILE_NAME+String("-mesh.tnl"),std::ios_base::out); @@ -612,7 +607,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) } - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { Pointers::SharedPointer globalCutGrid; MeshFunctionView loadMeshFunctionptr; diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp index 640aa5180..6621a01dd 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp @@ -1,11 +1,10 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include #include #include #include #include -#include #include "../../Functions/Functions.h" @@ -14,7 +13,6 @@ using namespace TNL::Containers; using namespace TNL::Functions; using namespace TNL::Meshes; using namespace TNL::Devices; -using namespace TNL::Communicators; TEST(CutMeshFunction, 2D) @@ -28,12 +26,12 @@ TEST(CutMeshFunction, 2D) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -43,18 +41,18 @@ TEST(CutMeshFunction, 2D) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - 
dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(0), StaticVector<1,int>(1), StaticVector<1,typename CutMeshType::IndexType>(5) ); @@ -62,13 +60,13 @@ TEST(CutMeshFunction, 2D) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=i; fromEntity.getCoordinates().y()=5; outEntity.getCoordinates().x()=i; @@ -91,12 +89,12 @@ TEST(CutMeshFunction, 3D_1) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -106,18 +104,18 @@ TEST(CutMeshFunction, 3D_1) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare 
Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<2,int>(0,2), StaticVector<2,typename CutMeshType::IndexType>(5,5) ); @@ -125,13 +123,13 @@ TEST(CutMeshFunction, 3D_1) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=5; fromEntity.getCoordinates().y()=i; fromEntity.getCoordinates().z()=5; @@ -154,12 +152,12 @@ TEST(CutMeshFunction, 3D_2) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -169,18 +167,18 @@ TEST(CutMeshFunction, 3D_2) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, 
StaticVector<2,int>(2,1), StaticVector<1,int>(0), StaticVector<1,typename CutMeshType::IndexType>(5) ); @@ -188,7 +186,7 @@ TEST(CutMeshFunction, 3D_2) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { @@ -196,7 +194,7 @@ TEST(CutMeshFunction, 3D_2) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=5; fromEntity.getCoordinates().y()=j; fromEntity.getCoordinates().z()=i; diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h index 6b7c489af..11a85b68d 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h @@ -6,7 +6,6 @@ email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ -#include #include #include #include @@ -18,7 +17,6 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; @@ -186,8 +184,6 @@ class ParameterProvider<3,Device> //------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedGridIO { @@ -227,9 +223,9 @@ class TestDistributedGridIO overlap.setValue(1); DistributedGridType distributedGrid; distributedGrid.setDomainDecomposition( parameters.getDistr() ); - distributedGrid.template setGlobalGrid( globalGrid ); + distributedGrid.setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 
1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); //std::cout << distributedGrid.printProcessDistr() < localGridptr; localGridptr->setDimensions(localProportions); @@ -313,14 +309,14 @@ class TestDistributedGridIO overlap.setValue(1); DistributedGridType distributedGrid; distributedGrid.setDomainDecomposition( parameters.getDistr() ); - distributedGrid.template setGlobalGrid( globalGrid ); + distributedGrid.setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); //save files from local mesh - PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup)); - PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));; + PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank()); + PointType localProportions=parameters.getProportions(TNL::MPI::GetRank()); Pointers::SharedPointer localGridptr; localGridptr->setDimensions(localProportions); @@ -355,7 +351,7 @@ class TestDistributedGridIO DistributedMeshSynchronizer< DistributedGridType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof + synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof //Crete "distributedgrid driven" grid filed by evaluated linear function Pointers::SharedPointer gridptr; @@ -367,7 +363,7 @@ class TestDistributedGridIO meshFunctionptr->bind(gridptr,dof); 
linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); for(int i=0;i #include #include -#include #include #include @@ -24,13 +23,10 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; //------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedGridMPIIO{ public: @@ -63,9 +59,9 @@ class TestDistributedGridMPIIO{ globalGrid->setDomain(globalOrigin,globalProportions); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); ///std::cout << distributedGrid.printProcessDistr() < ::save(FileName, *meshFunctionptr ); //first process compare results - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType globalEvaluatedDof(globalGrid->template getEntitiesCount< Cell >()); @@ -131,15 +127,15 @@ class TestDistributedGridMPIIO{ CoordinatesType overlap; overlap.setValue(1); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< 
MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); String FileName=String("test-file-mpiio-load.tnl"); //Prepare file - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType saveDof(globalGrid->template getEntitiesCount< Cell >()); @@ -165,7 +161,7 @@ class TestDistributedGridMPIIO{ DistributedMeshSynchronizer< DistributedGridType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof + synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof Pointers::SharedPointer evalGridPtr; Pointers::SharedPointer evalMeshFunctionptr; @@ -176,14 +172,14 @@ class TestDistributedGridMPIIO{ evalMeshFunctionptr->bind(evalGridPtr,evalDof); linearFunctionEvaluator.evaluateAllEntities(evalMeshFunctionptr , linearFunctionPtr); - synchronizer.template synchronize( *evalMeshFunctionptr ); + synchronizer.synchronize( *evalMeshFunctionptr ); for(int i=0;i -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include #include @@ -26,7 +25,6 @@ using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; template @@ -44,13 +42,13 @@ void check_Boundary_1D(int rank, int nproc, const DofType& dof, typename DofType EXPECT_EQ( dof[0], expectedValue) << "Left boundary test failed"; return; } - + if(rank==(nproc-1))//Right { EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Right boundary test failed"; return; } - + }; template @@ -61,15 +59,15 @@ void check_Overlap_1D(int rank, int nproc, const DofType& dof, typename DofType: EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Left boundary node overlap test failed"; return; } - + if( rank == ( nproc 
- 1 ) ) { EXPECT_EQ( dof[0], expectedValue) << "Right boundary node overlap test failed"; return; } - + EXPECT_EQ( dof[0], expectedValue) << "left overlap test failed"; - EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed"; + EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed"; }; template @@ -80,25 +78,24 @@ void check_Inner_1D(int rank, int nproc, const DofType& dof, typename DofType::R }; /* - * Light check of 1D distributed grid and its synchronization. + * Light check of 1D distributed grid and its synchronization. * Number of process is not limited. * Overlap is limited to 1 * Only double is tested as dof Real type -- it may be changed, extend test * Global size is hardcoded as 10 -- it can be changed, extend test */ -typedef MpiCommunicator CommunicatorType; typedef Grid<1,double,Host,int> GridType; typedef MeshFunctionView< GridType > MeshFunctionType; typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType; typedef Vector< double,Host,int> DofType; typedef Vector< bool, Host, int > MaskDofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; - + class DistributedGridTest_1D : public ::testing::Test { protected: @@ -123,14 +120,14 @@ class DistributedGridTest_1D : public ::testing::Test void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; GridType globalGrid; - globalOrigin.x()=-0.5; + globalOrigin.x()=-0.5; globalProportions.x()=size; @@ -142,9 +139,9 @@ class 
DistributedGridTest_1D : public ::testing::Test distributedGrid=new DistributedGridType(); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - distributedGrid->template setGlobalGrid( globalGrid ); - //distributedGrid->setupGrid(*gridptr); - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + distributedGrid->setGlobalGrid( globalGrid ); + //distributedGrid->setupGrid(*gridptr); + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); @@ -155,14 +152,14 @@ class DistributedGridTest_1D : public ::testing::Test constFunctionPtr->Number=rank; } - + void SetUpPeriodicBoundaries() { typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); - distributedGrid->setupGrid(*gridptr); + distributedGrid->setupGrid(*gridptr); } void TearDown() @@ -209,7 +206,7 @@ TEST_F(DistributedGridTest_1D, evaluateInteriorEntities) check_Boundary_1D(rank, nproc, dof, -1); check_Overlap_1D(rank, nproc, dof, -1); check_Inner_1D(rank, nproc, dof, rank); -} +} TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) { @@ -217,7 +214,7 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); + synchronizer.synchronize( *meshFunctionPtr ); if(rank!=0) { EXPECT_EQ((dof)[0],rank-1)<< "Left Overlap was filled by wrong process."; @@ -229,12 +226,12 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) TEST_F(DistributedGridTest_1D, EvaluateLinearFunction ) { - 
//fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) + //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) setDof_1D(dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); + synchronizer.synchronize( *meshFunctionPtr ); auto entity = gridptr->template getEntity< Cell >(0); entity.refresh(); @@ -250,7 +247,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask ) // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -258,13 +255,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask ) maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridptr, dof ); maskPointer->bind( gridptr, maskDofs ); - + setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr, true ); + synchronizer.synchronize( *meshFunctionPtr, true ); if( rank == 0 ) { EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process."; @@ -279,7 +276,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask ) // Setup periodic boundaries // TODO: I do 
not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -287,14 +284,14 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask ) maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridptr, dof ); maskPointer->bind( gridptr, maskDofs ); - + setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr, constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr, true, maskPointer ); + synchronizer.synchronize( *meshFunctionPtr, true, maskPointer ); if( rank == 0 ) { EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process."; } @@ -310,7 +307,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -325,9 +322,9 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); 
//meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); TNL_MPI_PRINT( "#### " << dof ); - meshFunctionPtr->template synchronize( true, maskPointer ); + meshFunctionPtr->synchronize( true, maskPointer ); TNL_MPI_PRINT( ">>> " << dof ); - + if( rank == 0 ) EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process."; if( rank == nproc-1 ) @@ -339,7 +336,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask ) // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -350,27 +347,27 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask ) setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); - maskDofs.setElement( 1, false ); + maskDofs.setElement( 1, false ); maskDofs.setElement( dof.getSize() - 2, false ); //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process."; if( rank == nproc-1 ) - EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process."; - + EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process."; + } */ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - 
additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -382,13 +379,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest ) Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr, true ); + synchronizer.synchronize( *meshFunctionPtr, true ); auto entity = gridptr->template getEntity< Cell >( 0 ); auto entity2= gridptr->template getEntity< Cell >( (dof).getSize() - 1 ); entity.refresh(); entity2.refresh(); - + if( rank == 0 ) { EXPECT_EQ( meshFunctionPtr->getValue(entity), 9 ) << "Linear function Overlap error on left Edge."; } diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp index 71370cae2..1f02dd236 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp @@ -7,14 +7,13 @@ ***************************************************************************/ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI #include #include -#include #include #include @@ -25,10 +24,9 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; - + template void setDof_2D( DofType &dof, typename DofType::RealType value ) @@ -46,7 +44,7 @@ void checkLeftEdge( const GridType &grid, const 
DofType &dof, bool with_first, b int end = maxy; if( !with_first ) begin++; if( !with_last ) end--; - + for( int i=begin;i void check_Boundary_2D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue) -{ +{ if(rank==0)//Up Left { checkUpEdge(grid,dof,true,false,expectedValue);//posledni je overlap checkLeftEdge(grid,dof,true,false, expectedValue);//posledni je overlap } - + if(rank==1)//Up Center { checkUpEdge(grid,dof,false,false, expectedValue);//prvni a posledni je overlap } - + if(rank==2)//Up Right { checkUpEdge(grid,dof,false,true,expectedValue);//prvni je overlap checkRightEdge(grid,dof,true,false,expectedValue);//posledni je overlap } - + if(rank==3)//Center Left { checkLeftEdge(grid,dof,false,false,expectedValue);//prvni a posledni je overlap } - + if(rank==4)//Center Center { //No boundary } - + if(rank==5)//Center Right { checkRightEdge(grid,dof,false,false,expectedValue); } - + if(rank==6)//Down Left { checkDownEdge(grid,dof,true,false,expectedValue); checkLeftEdge(grid,dof,false,true,expectedValue); } - + if(rank==7) //Down Center { checkDownEdge(grid,dof,false,false,expectedValue); } - + if(rank==8) //Down Right { checkDownEdge(grid,dof,false,true,expectedValue); @@ -241,27 +239,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena checkRightEdge(grid,dof,false,true,expectedValue); checkDownEdge(grid,dof,false,true,expectedValue); } - + if(rank==1)//Up Center { checkDownEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,false,true,expectedValue); } - + if(rank==2)//Up Right { checkDownEdge(grid,dof,true,false,expectedValue);//prvni je overlap checkLeftEdge(grid,dof,false,true,expectedValue); } - + if(rank==3)//Center Left { checkUpEdge(grid,dof,false,true,expectedValue); checkDownEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,true,true,expectedValue); } - + if(rank==4)//Center Center { 
checkUpEdge(grid,dof,true,true,expectedValue); @@ -269,27 +267,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena checkRightEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,true,true,expectedValue); } - + if(rank==5)//Center Right { checkUpEdge(grid,dof,true,false,expectedValue); checkDownEdge(grid,dof,true,false,expectedValue); checkLeftEdge(grid,dof,true,true,expectedValue); } - + if(rank==6)//Down Left { checkUpEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,true,false,expectedValue); } - + if(rank==7) //Down Center { checkUpEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,true,false,expectedValue); checkRightEdge(grid,dof,true,false,expectedValue); } - + if(rank==8) //Down Right { checkUpEdge(grid,dof,true,false,expectedValue); @@ -310,26 +308,25 @@ void check_Inner_2D(int rank, const GridType& grid, const DofType& dof, typename } /* - * Light check of 2D distributed grid and its synchronization. + * Light check of 2D distributed grid and its synchronization. 
* expected 9 processes */ -typedef MpiCommunicator CommunicatorType; typedef Grid<2,double,Host,int> GridType; typedef MeshFunctionView MeshFunctionType; typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType; typedef Vector DofType; typedef Vector< bool, Host, int > MaskDofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; class DistributedGridTest_2D : public ::testing::Test { - + public: - + using CoordinatesType = typename GridType::CoordinatesType; DistributedGridType *distributedGrid; @@ -347,20 +344,20 @@ class DistributedGridTest_2D : public ::testing::Test Pointers::SharedPointer< LinearFunction, Host > linearFunctionPtr; int rank; - int nproc; + int nproc; void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; GridType globalGrid; globalOrigin.x()=-0.5; - globalOrigin.y()=-0.5; + globalOrigin.y()=-0.5; globalProportions.x()=size; globalProportions.y()=size; @@ -369,9 +366,9 @@ class DistributedGridTest_2D : public ::testing::Test distributedGrid=new DistributedGridType(); distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3 ) ); - distributedGrid->template setGlobalGrid( globalGrid ); + distributedGrid->setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); 
distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -422,17 +419,17 @@ TEST_F(DistributedGridTest_2D, evaluateInteriorEntities) check_Boundary_2D(rank, *gridPtr, *dof, -1); check_Overlap_2D(rank, *gridPtr, *dof, -1); check_Inner_2D(rank, *gridPtr, *dof, rank); -} +} TEST_F(DistributedGridTest_2D, LinearFunctionTest) { - //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) + //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) setDof_2D(*dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); - + synchronizer.synchronize( *meshFunctionPtr ); + int count =gridPtr->template getEntitiesCount< Cell >(); for(int i=0;igetMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); - + synchronizer.synchronize( *meshFunctionPtr ); + // checkNeighbor_2D(rank, *gridPtr, *dof); - + if(rank==0)//Up Left { checkRightEdge(*gridPtr, *dof, true, false, 1 ); checkDownEdge( *gridPtr, *dof, true, false, 3 ); checkCorner( *gridPtr, *dof, false, false, 4 ); } - + if(rank==1)//Up Center { checkLeftEdge( *gridPtr, *dof, true, false, 0 ); @@ -468,14 +465,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, false, false, 4 ); checkCorner( *gridPtr, *dof, false, false, 5 ); } - + if(rank==2)//Up Right { checkLeftEdge( *gridPtr, *dof, true, false, 1 ); checkCorner( *gridPtr, *dof, false, true, 4 ); checkDownEdge( *gridPtr, *dof, false, true, 5 ); } - + if(rank==3)//Center Left { checkUpEdge( *gridPtr, *dof, true, false, 0 ); @@ -484,7 +481,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, true, false, 6 ); checkCorner( 
*gridPtr, *dof, false, false, 7 ); } - + if(rank==4)//Center Center { checkCorner( *gridPtr, *dof, true, true, 0 ); @@ -496,7 +493,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, false, false, 7 ); checkCorner( *gridPtr, *dof, false, false, 8 ); } - + if(rank==5)//Center Right { checkCorner( *gridPtr, *dof, true, true, 1 ); @@ -505,14 +502,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkCorner( *gridPtr, *dof, false, true, 7 ); checkDownEdge( *gridPtr, *dof, false, true, 8 ); } - + if(rank==6)//Down Left { checkUpEdge( *gridPtr, *dof, true, false, 3 ); checkCorner( *gridPtr, *dof, true, false, 4 ); checkRightEdge( *gridPtr, *dof, false, true, 7 ); } - + if(rank==7) //Down Center { checkCorner( *gridPtr, *dof, true, true, 3 ); @@ -521,77 +518,77 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkLeftEdge( *gridPtr, *dof, false, true, 6 ); checkRightEdge( *gridPtr, *dof, false, true, 8 ); } - + if(rank==8) //Down Right { checkCorner( *gridPtr, *dof, true, true, 4 ); checkUpEdge( *gridPtr, *dof, false, true, 5 ); checkLeftEdge( *gridPtr, *dof, false, true, 7 ); - } + } } -// TODO: Fix tests for periodic BC - +// TODO: Fix tests for periodic BC - // checkLeftBoundary -> checkLeft Overlap etc. 
for direction BoundaryToOverlap // Fix the tests with mask to work with the direction OverlapToBoundary /* TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); dof->setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true ); - + meshFunctionPtr->synchronize( true ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank 
== 7 ) { SCOPED_TRACE( "Down Center" ); @@ -609,10 +606,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveMask ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -620,13 +617,13 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); + meshFunctionPtr->synchronize( true, maskPointer ); if( rank == 0 ) { @@ -634,39 +631,39 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, 
true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -684,10 +681,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiveMaskOnLeft ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -695,7 +692,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -711,47 +708,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, 0 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + 
if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, 3 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, 6 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -769,10 +766,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskOnRight ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -780,7 +777,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -796,47 +793,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); 
meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, 2 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, 5 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -854,10 +851,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskUp ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -865,7 +862,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); 
meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -881,47 +878,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, 0 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, 1 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, 2 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -939,10 +936,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskDown ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< 
GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -950,7 +947,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -966,47 +963,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, 6 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -1020,7 +1017,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv checkRightBoundary( *gridPtr, *dof, true, false, -7 ); } } -*/ +*/ #endif #endif diff --git 
a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp index 765341c1e..4f552dee5 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp @@ -1,9 +1,8 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include #include @@ -16,8 +15,7 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; -using namespace TNL::Meshes::DistributedMeshes; +using namespace TNL::Meshes::DistributedMeshes; template void setDof_3D(DofType &dof, typename DofType::RealType value) @@ -49,14 +47,14 @@ void checkConner(const GridType &grid, const DofType &dof,bool bottom, bool nort { int i=getAdd(grid,bottom,north,west); EXPECT_EQ( dof[i], expectedValue) << "Conner test failed"; - + } template void checkXDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool north, typename DofType::RealType expectedValue) { - int add=getAdd(grid,bottom,north,true); - for(int i=1;i void checkYDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool west, typename DofType::RealType expectedValue) { int add=getAdd(grid,bottom,true,west); - for(int i=1;i void checkZDirectionEdge(const GridType &grid, const DofType &dof, bool north, bool west, typename DofType::RealType expectedValue) { int add=getAdd(grid,true,north,west); - for(int i=1;i @@ -429,7 +427,7 @@ void CheckYFaceNode_Overlap(const GridType &grid, const DofType &dof,bool north, checkXFace(grid, dof, true, expectedValue); checkYFace(grid, dof, !north, expectedValue); checkZFace(grid, dof, false, expectedValue); - checkZFace(grid, dof, true, expectedValue); + checkZFace(grid, dof, true, expectedValue); } template @@ -451,7 +449,7 @@ void CheckZFaceNode_Overlap(const GridType &grid, const 
DofType &dof,bool bottom checkXFace(grid, dof, true, expectedValue); checkYFace(grid, dof, false, expectedValue); checkYFace(grid, dof, true, expectedValue); - checkZFace(grid, dof, !bottom, expectedValue); + checkZFace(grid, dof, !bottom, expectedValue); } template @@ -484,11 +482,11 @@ void CheckCentralNode_Overlap(const GridType &grid, const DofType &dof,typename checkYFace(grid, dof, false, expectedValue); checkYFace(grid, dof, true, expectedValue); checkZFace(grid, dof, false, expectedValue); - checkZFace(grid, dof, true, expectedValue); + checkZFace(grid, dof, true, expectedValue); } /* -* Expected 27 processes. +* Expected 27 processes. */ template void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue) @@ -499,7 +497,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena if(rank==1) CheckXEdgeNode_Overlap(grid,dof,true,true,expectedValue); - if(rank==2) + if(rank==2) CheckConnerNode_Overlap(grid,dof,true,true,false,expectedValue); if(rank==3) @@ -553,7 +551,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena if(rank==19) CheckXEdgeNode_Overlap(grid,dof,false,true,expectedValue); - if(rank==20) + if(rank==20) CheckConnerNode_Overlap(grid,dof,false,true,false,expectedValue); if(rank==21) @@ -590,19 +588,18 @@ void check_Inner_3D(int rank, const GridType& grid, const DofType& dof, typename /* - * Light check of 3D distributed grid and its synchronization. + * Light check of 3D distributed grid and its synchronization. 
* expected 27 processes */ -typedef MpiCommunicator CommunicatorType; typedef Grid<3,double,Host,int> GridType; typedef MeshFunctionView MeshFunctionType; typedef Vector DofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; - + class DistributedGirdTest_3D : public ::testing::Test { protected: @@ -620,14 +617,14 @@ class DistributedGirdTest_3D : public ::testing::Test Pointers::SharedPointer< LinearFunction, Host > linearFunctionPtr; int rank; - int nproc; + int nproc; void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; @@ -635,7 +632,7 @@ class DistributedGirdTest_3D : public ::testing::Test globalOrigin.x()=-0.5; globalOrigin.y()=-0.5; - globalOrigin.z()=-0.5; + globalOrigin.z()=-0.5; globalProportions.x()=size; globalProportions.y()=size; globalProportions.z()=size; @@ -645,17 +642,17 @@ class DistributedGirdTest_3D : public ::testing::Test distributedGrid=new DistributedGridType(); distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3, 3 ) ); - distributedGrid->template setGlobalGrid( globalGrid ); - distributedGrid->setupGrid(*gridptr); + distributedGrid->setGlobalGrid( globalGrid ); + distributedGrid->setupGrid(*gridptr); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, 
upperOverlap ); distributedGrid->setupGrid(*gridptr); dof=new DofType(gridptr->template getEntitiesCount< Cell >()); - meshFunctionptr->bind(gridptr,*dof); + meshFunctionptr->bind(gridptr,*dof); constFunctionPtr->Number=rank; } @@ -697,17 +694,17 @@ TEST_F(DistributedGirdTest_3D, evaluateInteriorEntities) check_Boundary_3D(rank, *gridptr, *dof, -1); check_Overlap_3D(rank, *gridptr, *dof, -1); check_Inner_3D(rank, *gridptr, *dof, rank); -} +} TEST_F(DistributedGirdTest_3D, LinearFunctionTest) { - //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) + //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) setDof_3D(*dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionptr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionptr ); - + synchronizer.synchronize( *meshFunctionptr ); + int count =gridptr->template getEntitiesCount< Cell >(); for(int i=0;i #include #include #include @@ -17,13 +16,10 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; //------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedVectorFieldMPIIO{ public: @@ -33,8 +29,8 @@ class TestDistributedVectorFieldMPIIO{ typedef VectorField VectorFieldType; typedef Vector DofType; typedef typename MeshType::Cell Cell; - typedef typename MeshType::IndexType IndexType; - typedef typename MeshType::PointType PointType; + typedef typename MeshType::IndexType IndexType; + typedef typename MeshType::PointType PointType; typedef DistributedMesh DistributedGridType; typedef typename 
DistributedGridType::CoordinatesType CoordinatesType; @@ -43,8 +39,8 @@ class TestDistributedVectorFieldMPIIO{ static void TestSave() { Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; - MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; - + MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; + //save distributed meshfunction into file PointType globalOrigin; globalOrigin.setValue(-0.5); @@ -55,14 +51,14 @@ class TestDistributedVectorFieldMPIIO{ Pointers::SharedPointer globalGrid; globalGrid->setDimensions(globalProportions); globalGrid->setDomain(globalOrigin,globalProportions); - + DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); - Pointers::SharedPointer gridptr; + Pointers::SharedPointer gridptr; distributedGrid.setupGrid(*gridptr); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); distributedGrid.setupGrid(*gridptr); @@ -74,10 +70,10 @@ class TestDistributedVectorFieldMPIIO{ DofType dof(vctdim*(gridptr->template getEntitiesCount< Cell >())); dof.setValue(0); vectorField.bind(gridptr,dof); - + for(int i=0;i ::save(FileName, vectorField ); /*File file; @@ -86,7 +82,7 @@ class TestDistributedVectorFieldMPIIO{ file.close(); */ //first process compare results - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType globalEvaluatedDof(vctdim*(globalGrid->template getEntitiesCount< Cell >())); @@ -101,7 +97,7 @@ class TestDistributedVectorFieldMPIIO{ loadvct.bind(globalGrid,loadDof); loadDof.setValue(-1); - + File file; file.open( 
FileName, std::ios_base::in ); loadvct.boundLoad(file); @@ -111,13 +107,13 @@ class TestDistributedVectorFieldMPIIO{ } } }; - + static void TestLoad() { Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; - MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; + MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; - //Crete distributed grid + //Crete distributed grid PointType globalOrigin; globalOrigin.setValue(-0.5); @@ -131,26 +127,26 @@ class TestDistributedVectorFieldMPIIO{ CoordinatesType overlap; overlap.setValue(1); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid(*globalGrid); + distributedGrid.setGlobalGrid(*globalGrid); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); - String FileName=String("/tmp/test-file.tnl"); + String FileName=String("/tmp/test-file.tnl"); - //Prepare file - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) - { + //Prepare file + if(TNL::MPI::GetRank()==0) + { DofType saveDof(vctdim*(globalGrid->template getEntitiesCount< Cell >())); VectorFieldType saveVectorField; saveVectorField.bind(globalGrid,saveDof); for(int i=0;i loadGridptr; VectorFieldType loadVectorField; distributedGrid.setupGrid(*loadGridptr); - + DofType loadDof(vctdim*(loadGridptr->template getEntitiesCount< Cell >())); loadDof.setValue(0); loadVectorField.bind(loadGridptr,loadDof); @@ -169,26 +165,26 @@ class TestDistributedVectorFieldMPIIO{ synchronizer.setDistributedGrid( &distributedGrid ); for(int i=0;i(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof + 
synchronizer.synchronize(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof Pointers::SharedPointer evalGridPtr; VectorFieldType evalVectorField; distributedGrid.setupGrid(*evalGridPtr); - + DofType evalDof(vctdim*(evalGridPtr->template getEntitiesCount< Cell >())); evalDof.setValue(-1); evalVectorField.bind(evalGridPtr,evalDof); - + for(int i=0;i(*evalVectorField[i]); + linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr); + synchronizer.synchronize(*evalVectorField[i]); } for(int i=0;i Date: Sat, 2 Jan 2021 15:09:35 +0100 Subject: [PATCH 47/50] MPI refactoring: replaced DimsCreate and CreateNewGroup in MpiCommunicator with plain function wrappers --- src/TNL/Communicators/MpiCommunicator.h | 41 -------------------- src/TNL/MPI/DummyDefs.h | 12 ++++++ src/TNL/MPI/Wrappers.h | 50 +++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 41 deletions(-) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index c155cabbe..47392ca60 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -175,47 +175,6 @@ class MpiCommunicator MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group ); } - - //dim-number of dimensions, distr array of guess distr - 0 for computation - //distr array will be filled by computed distribution - //more information in MPI documentation - static void DimsCreate(int nproc, int dim, int *distr) - { -#ifdef HAVE_MPI - int sum = 0, prod = 1; - for( int i = 0;i < dim; i++ ) { - sum += distr[ i ]; - prod *= distr[ i ]; - } - if( prod != 0 && prod != GetSize( AllGroup ) ) - throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions." - "Non of the dimensions is zero and product of all dimensions does " - "not fit with number of MPI processes." 
); - if(sum==0) { - for(int i=0;i +#include #ifdef HAVE_MPI #include @@ -156,6 +157,55 @@ inline int GetSize( MPI_Comm group = AllGroup() ) #endif } +// wrappers for MPI helper functions + +inline MPI_Comm Comm_split( MPI_Comm comm, int color, int key ) +{ +#ifdef HAVE_MPI + MPI_Comm newcomm; + MPI_Comm_split( comm, color, key, &newcomm ); + return newcomm; +#else + return comm; +#endif +} + +/** + * \brief Wrapper for \ref MPI_Dims_create. + * + * \param nproc - number of processes in the group to be distributed + * \param ndims - number of dimensions of the Cartesian grid + * \param dims - distribution of processes into the \e dim-dimensional + * Cartesian grid (array of length \e ndims) + * + * Negative input values of \e dims[i] are erroneous. An error will occur if + * \e nproc is not a multiple of the product of all non-zero values \e dims[i]. + * + * See the MPI documentation for more information. + */ +inline void Compute_dims( int nproc, int ndims, int* dims ) +{ +#ifdef HAVE_MPI + int prod = 1; + for( int i = 0; i < ndims; i++ ) { + if( dims[ i ] < 0 ) + throw std::invalid_argument( "Negative value passed to MPI::Compute_dims in the dims array argument." ); + if( dims[ i ] > 0 ) + prod *= dims[ i ]; + } + + if( nproc % prod != 0 ) + throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions." + "The product of the non-zero values dims[i] is " + std::to_string(prod) + " and the " + "number of processes (" + std::to_string(nproc) + ") is not a multiple of the product." 
); + + MPI_Dims_create( nproc, ndims, dims ); +#else + for( int i = 0; i < ndims; i++) + dims[ i ] = 1; +#endif +} + // wrappers for MPI communication functions inline void Barrier( MPI_Comm group = AllGroup() ) -- GitLab From 60ee5cd078e372949d55b769f4a0c66c0c961f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Jan 2021 15:50:25 +0100 Subject: [PATCH 48/50] MPI refactoring: removed unit tests for MpiCommunicator and marked it as deprecated --- src/TNL/Communicators/MpiCommunicator.h | 3 +- src/UnitTests/CMakeLists.txt | 1 - src/UnitTests/Communicators/CMakeLists.txt | 9 ---- .../Communicators/MpiCommunicatorTest.cpp | 51 ------------------- .../DistributedVectorFieldIO_MPIIOTest.cpp | 5 -- src/UnitTests/main_mpi.h | 5 +- 6 files changed, 4 insertions(+), 70 deletions(-) delete mode 100644 src/UnitTests/Communicators/CMakeLists.txt delete mode 100644 src/UnitTests/Communicators/MpiCommunicatorTest.cpp diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 47392ca60..cd5162968 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -21,7 +21,8 @@ namespace Communicators { namespace { //! \brief MPI communicator. 
-class MpiCommunicator +class [[deprecated("use the functions in the TNL::MPI namespace instead")]] +MpiCommunicator { public: #ifdef HAVE_MPI diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt index 8e4ac7249..2c0ba8650 100644 --- a/src/UnitTests/CMakeLists.txt +++ b/src/UnitTests/CMakeLists.txt @@ -1,4 +1,3 @@ -ADD_SUBDIRECTORY( Communicators ) ADD_SUBDIRECTORY( Containers ) ADD_SUBDIRECTORY( Functions ) # Matrices are included from src/CMakeLists.txt diff --git a/src/UnitTests/Communicators/CMakeLists.txt b/src/UnitTests/Communicators/CMakeLists.txt deleted file mode 100644 index 1a3331c3a..000000000 --- a/src/UnitTests/Communicators/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -if( ${BUILD_MPI} ) - ADD_EXECUTABLE( MpiCommunicatorTest MpiCommunicatorTest.cpp ) - TARGET_COMPILE_OPTIONS( MpiCommunicatorTest PRIVATE ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( MpiCommunicatorTest ${GTEST_BOTH_LIBRARIES} ) - - SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MpiCommunicatorTest${CMAKE_EXECUTABLE_SUFFIX}" ) - ADD_TEST( NAME MpiCommunicatorTest COMMAND "mpirun" ${mpi_test_parameters}) - -endif() diff --git a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp b/src/UnitTests/Communicators/MpiCommunicatorTest.cpp deleted file mode 100644 index b78011953..000000000 --- a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/*************************************************************************** - MpiCommunicatorTest.h - description - ------------------- - begin : Jul 10, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef HAVE_GTEST - -#include "gtest/gtest.h" -#include - -using namespace TNL; -using namespace TNL::Communicators; - -// test fixture for typed tests -template< typename Real > -class 
MpiCommunicatorTest : public ::testing::Test -{ - protected: - using RealType = Real; - using CommunicatorType = MpiCommunicator; -}; - -// types for which MpiCommunicatorTest is instantiated -using MpiCommunicatorTypes = ::testing::Types< - short, - int, - long, - float, - double ->; - -TYPED_TEST_SUITE( MpiCommunicatorTest, MpiCommunicatorTypes ); - -TYPED_TEST( MpiCommunicatorTest, allReduce ) -{ - using RealType = typename TestFixture::RealType; - using CommunicatorType = typename TestFixture::CommunicatorType; - RealType a = CommunicatorType::GetRank(); - RealType b = 0; - CommunicatorType::Allreduce( &a, &b, 1, MPI_MAX, MPI_COMM_WORLD ); - EXPECT_EQ( b, CommunicatorType::GetSize() - 1 ); -} - -#endif // HAVE_GTEST - -#include "../main_mpi.h" \ No newline at end of file diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp index 0a5ab3e37..9bdccbcdb 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp @@ -2,13 +2,8 @@ #include #ifdef HAVE_MPI -#include #include "DistributedVectorFieldIO_MPIIOTestBase.h" -using namespace TNL::Communicators; - -typedef MpiCommunicator CommunicatorType; - TEST( DistributedVectorFieldIO_MPIIO, Save_1D ) { TestDistributedVectorFieldMPIIO<1,2,Host>::TestSave(); diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h index 4c89b60ba..d22f6d3eb 100644 --- a/src/UnitTests/main_mpi.h +++ b/src/UnitTests/main_mpi.h @@ -6,9 +6,8 @@ #endif #if (defined(HAVE_GTEST) && defined(HAVE_MPI)) -#include #include -using CommunicatorType = TNL::Communicators::MpiCommunicator; +#include #include @@ -37,7 +36,7 @@ public: // Called after a test ends. 
virtual void OnTestEnd(const ::testing::TestInfo& test_info) { - const int rank = CommunicatorType::GetRank(CommunicatorType::AllGroup); + const int rank = TNL::MPI::GetRank(); sout << test_info.test_case_name() << "." << test_info.name() << " End." < Date: Sat, 2 Jan 2021 15:52:57 +0100 Subject: [PATCH 49/50] MPI refactoring: cleaned up benchmarks --- src/Benchmarks/Benchmarks.h | 8 +++---- src/Benchmarks/ODESolvers/SimpleProblem.h | 10 ++++---- src/Benchmarks/ODESolvers/benchmarks.h | 29 +---------------------- 3 files changed, 8 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index cbd628b03..2b2389e2c 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include namespace TNL { namespace Benchmarks { @@ -55,7 +55,7 @@ struct BenchmarkResult elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; - else + else elements << "N/A"; return elements; } @@ -356,9 +356,7 @@ inline Benchmark::MetadataMap getHardwareMetadata() { "system release", SystemInfo::getSystemRelease() }, { "start time", SystemInfo::getCurrentTime() }, #ifdef HAVE_MPI - { "number of MPI processes", convertToString( (Communicators::MpiCommunicator::IsInitialized()) - ? 
Communicators::MpiCommunicator::GetSize( Communicators::MpiCommunicator::AllGroup ) - : 1 ) }, + { "number of MPI processes", convertToString( TNL::MPI::GetSize() ) }, #endif { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) }, { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) }, diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h index 122606a32..65f769dda 100644 --- a/src/Benchmarks/ODESolvers/SimpleProblem.h +++ b/src/Benchmarks/ODESolvers/SimpleProblem.h @@ -14,11 +14,10 @@ #include #include -#include namespace TNL { namespace Benchmarks { - + template< typename Real = double, typename Device = Devices::Host, typename Index = int > @@ -28,8 +27,7 @@ struct SimpleProblem using DeviceType = Device; using IndexType = Index; using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using CommunicatorType = Communicators::MpiCommunicator; - + template< typename VectorPointer > void getExplicitUpdate( const RealType& time, const RealType& tau, @@ -46,10 +44,10 @@ struct SimpleProblem }; Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, u.getSize(), computeF, u, fu ); } - + template< typename Vector > void applyBoundaryConditions( const RealType& t, Vector& u ) {}; - + }; } // namespace Benchmarks diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h index 60b533663..a6ee67a62 100644 --- a/src/Benchmarks/ODESolvers/benchmarks.h +++ b/src/Benchmarks/ODESolvers/benchmarks.h @@ -16,8 +16,6 @@ #include #include "../Benchmarks.h" -#include "SimpleProblem.h" - #include // std::runtime_error @@ -35,31 +33,6 @@ getPerformer() return "CPU"; } -/*template< typename Matrix > -void barrier( const Matrix& matrix ) -{ -} - -template< typename Matrix, typename Communicator > -void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix ) -{ - Communicator::Barrier( matrix.getCommunicationGroup() ); -}*/ 
- -template< typename Device > -bool checkDevice( const Config::ParameterContainer& parameters ) -{ - const String device = parameters.getParameter< String >( "device" ); - if( device == "all" ) - return true; - if( std::is_same< Device, Devices::Host >::value && device == "host" ) - return true; - if( std::is_same< Device, Devices::Cuda >::value && device == "cuda" ) - return true; - return false; -} - - template< typename Solver, typename VectorPointer > void benchmarkSolver( Benchmark& benchmark, @@ -90,7 +63,7 @@ benchmarkSolver( Benchmark& benchmark, auto compute = [&]() { solver.solve( u ); }; - + // subclass BenchmarkResult to add extra columns to the benchmark // (iterations, preconditioned residue, true residue) /*struct MyBenchmarkResult : public BenchmarkResult -- GitLab From fb3807b59c1e2b0588e297c257c7238e51c4ef9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 23 Jan 2021 12:02:23 +0100 Subject: [PATCH 50/50] Fixed the index of tutorials --- Documentation/Tutorials/index.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 56a51cc22..55b92ad81 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,11 +2,10 @@ ## Tutorials -1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) -2. [General concepts](tutorial_GeneralConcepts.html) -3. [Arrays](tutorial_Arrays.html) -4. [Vectors](tutorial_Vectors.html) -5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -6. [For loops](tutorial_ForLoops.html) -7. [Cross-device pointers](tutorial_Pointers.html) -8. [Matrices](tutorial_Matrices.html) +1. [General concepts](tutorial_GeneralConcepts.html) +2. [Arrays](tutorial_Arrays.html) +3. [Vectors](tutorial_Vectors.html) +4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +5. [For loops](tutorial_ForLoops.html) +6. 
[Cross-device pointers](tutorial_Pointers.html) +7. [Matrices](tutorial_Matrices.html) -- GitLab