From dfb0351720a8464031a327292ab4e0ce8b9b7e30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 1 Nov 2020 12:48:34 +0100
Subject: [PATCH 01/50] Merge the functionality of NoDistrCommunicator into
 MpiCommunicator

When the program is compiled without MPI support, or when it is run without
mpirun, MpiCommunicator behaves just like NoDistrCommunicator. Hence, we no
longer need to distinguish between the two classes.
---
 src/TNL/Communicators/MpiCommunicator.h | 36 +++++++++----------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 1382fb7a6..dd119e813 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -165,8 +165,6 @@ class MpiCommunicator
                Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
             }
          }
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -193,7 +191,7 @@ class MpiCommunicator
          MPI_Finalized(&finalized);
          return initialized && !finalized;
 #else
-         throw Exceptions::MPISupportMissing();
+         return true;
 #endif
       }
 
@@ -206,7 +204,7 @@ class MpiCommunicator
          MPI_Comm_rank(group,&rank);
          return rank;
 #else
-         throw Exceptions::MPISupportMissing();
+         return 0;
 #endif
       }
 
@@ -219,7 +217,7 @@ class MpiCommunicator
          MPI_Comm_size(group,&size);
          return size;
 #else
-         throw Exceptions::MPISupportMissing();
+         return 1;
 #endif
       }
 
@@ -252,7 +250,8 @@ class MpiCommunicator
 
          MPI_Dims_create(nproc, dim, distr);
 #else
-         throw Exceptions::MPISupportMissing();
+         for(int i=0;i<dim;i++)
+            distr[i]=1;
 #endif
       }
 
@@ -262,8 +261,6 @@ class MpiCommunicator
          TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
          TNL_ASSERT_NE(group, NullGroup, "Barrier cannot be called with NullGroup");
          MPI_Barrier(group);
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -274,8 +271,6 @@ class MpiCommunicator
          TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
          TNL_ASSERT_NE(group, NullGroup, "Send cannot be called with NullGroup");
          MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group );
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -287,8 +282,6 @@ class MpiCommunicator
          TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup");
          MPI_Status status;
          MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status );
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
      }
 
@@ -302,7 +295,7 @@ class MpiCommunicator
          MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req);
          return req;
 #else
-         throw Exceptions::MPISupportMissing();
+         return 1;
 #endif
       }
 
@@ -316,7 +309,7 @@ class MpiCommunicator
          MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req);
          return req;
 #else
-         throw Exceptions::MPISupportMissing();
+         return 1;
 #endif
       }
 
@@ -325,8 +318,6 @@ class MpiCommunicator
 #ifdef HAVE_MPI
          TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
          MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE);
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -337,8 +328,6 @@ class MpiCommunicator
          TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
          TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
          MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group);
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -353,7 +342,7 @@ class MpiCommunicator
          TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
          MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group);
 #else
-         throw Exceptions::MPISupportMissing();
+         memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
 #endif
       }
 
@@ -367,8 +356,6 @@ class MpiCommunicator
 #ifdef HAVE_MPI
          TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
          MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group);
-#else
-         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -385,7 +372,7 @@ class MpiCommunicator
          TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup");
          MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group);
 #else
-         throw Exceptions::MPISupportMissing();
+         memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) );
 #endif
       }
 
@@ -437,7 +424,8 @@ class MpiCommunicator
                        MPITypeResolver< T >::getType(),
                        group );
 #else
-         throw Exceptions::MPISupportMissing();
+         TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
+         memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) );
 #endif
       }
 
@@ -458,7 +446,7 @@ class MpiCommunicator
          else
             MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup);
 #else
-         throw Exceptions::MPISupportMissing();
+         newGroup=oldGroup;
 #endif
       }
 
-- 
GitLab


From 3bddf4139bfa72f4bba3f4e865638757ec32fe0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 1 Nov 2020 12:58:55 +0100
Subject: [PATCH 02/50] Removed unnecessary uses of NoDistrCommunicator

---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |  5 --
 .../tnl-benchmark-linear-solvers.h            |  5 --
 src/Benchmarks/ODESolvers/Euler.hpp           |  2 -
 src/Benchmarks/ODESolvers/Merson.hpp          |  2 -
 src/Benchmarks/ODESolvers/SimpleProblem.h     |  3 +-
 .../ODESolvers/tnl-benchmark-ode-solvers.h    |  5 --
 src/TNL/Solvers/Linear/Traits.h               |  4 +-
 src/TNL/Solvers/ODE/Merson_impl.h             |  1 -
 src/TNL/Solvers/SolverInitiator.h             |  2 +-
 src/TNL/Solvers/SolverInitiator_impl.h        | 39 ++-------
 src/TNL/Solvers/SolverStarter_impl.h          |  2 -
 src/Tools/tnl-init.h                          | 87 ++++++++-----------
 12 files changed, 48 insertions(+), 109 deletions(-)

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 683e6960a..b791b0100 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -20,7 +20,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -39,11 +38,7 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index,
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-#ifdef HAVE_MPI
 using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
 
 
 template< typename Matrix, typename Vector >
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index e5a8d9819..cadb5a046 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -25,7 +25,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -66,11 +65,7 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-#ifdef HAVE_MPI
 using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
 
 
 static const std::set< std::string > valid_solvers = {
diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp
index ab975ed07..5039417b7 100644
--- a/src/Benchmarks/ODESolvers/Euler.hpp
+++ b/src/Benchmarks/ODESolvers/Euler.hpp
@@ -10,8 +10,6 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include "ComputeBlockResidue.h"
 
 namespace TNL {
diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp
index c97bfc236..1fd8f8a2b 100644
--- a/src/Benchmarks/ODESolvers/Merson.hpp
+++ b/src/Benchmarks/ODESolvers/Merson.hpp
@@ -13,8 +13,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 
 #include "Merson.h"
 
diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h
index ff81fd18e..122606a32 100644
--- a/src/Benchmarks/ODESolvers/SimpleProblem.h
+++ b/src/Benchmarks/ODESolvers/SimpleProblem.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -27,7 +28,7 @@ struct SimpleProblem
    using DeviceType = Device;
    using IndexType = Index;
    using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType >;
-   using CommunicatorType = Communicators::NoDistrCommunicator;
+   using CommunicatorType = Communicators::MpiCommunicator;
    
    template< typename VectorPointer >
    void getExplicitUpdate( const RealType& time,
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index bbde88945..aa4370c7a 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -24,7 +24,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
@@ -39,11 +38,7 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-#ifdef HAVE_MPI
 using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
 
 
 template< typename Real, typename Index >
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 9a5db2c40..5f93e0cde 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -12,7 +12,7 @@
 
 #pragma once
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -26,7 +26,7 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
-   using CommunicatorType = Communicators::NoDistrCommunicator;
+   using CommunicatorType = Communicators::MpiCommunicator;
 
    using VectorType = Containers::Vector
          < typename Matrix::RealType,
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 4c7b21bc9..82a6a87ff 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -14,7 +14,6 @@
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 
 #include "Merson.h"
 
diff --git a/src/TNL/Solvers/SolverInitiator.h b/src/TNL/Solvers/SolverInitiator.h
index 0ba4dc55a..062857520 100644
--- a/src/TNL/Solvers/SolverInitiator.h
+++ b/src/TNL/Solvers/SolverInitiator.h
@@ -16,7 +16,7 @@
 namespace TNL {
 namespace Solvers {
 
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter , typename CommunicatorType  > class ProblemSetter,
+template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
           typename ConfigTag >
 class SolverInitiator
 {
diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h
index 16e0fd222..3d704426d 100644
--- a/src/TNL/Solvers/SolverInitiator_impl.h
+++ b/src/TNL/Solvers/SolverInitiator_impl.h
@@ -18,7 +18,6 @@
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Meshes/DummyMesh.h>
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
@@ -50,15 +49,6 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Device,
           typename Index,
           typename ConfigTag,
-          bool enabled = true  >
-class CommunicatorTypeResolver {};
-
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename ConfigTag,
-          typename CommunicatorType,
           bool enabled = ConfigTagMeshResolve< ConfigTag >::enabled >
 class SolverInitiatorMeshResolver {};
 
@@ -169,7 +159,7 @@ class SolverInitiatorIndexResolver< ProblemSetter, Real, Device, Index, ConfigTa
    public:
       static bool run( const Config::ParameterContainer& parameters )
       {
-         return CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::run( parameters );
+         return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag >::run( parameters );
       }
 };
 
@@ -178,28 +168,12 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Device,
           typename Index,
           typename ConfigTag >
-class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
-{
-   public:
-      static bool run( const Config::ParameterContainer& parameters )
-      {
-         if( Communicators::MpiCommunicator::isDistributed() )
-            return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters );
-         return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters );
-      }
-};
-
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename ConfigTag,
-          typename CommunicatorType >
-class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, CommunicatorType, false >
+class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, false >
 {
    public:
       static bool run( const Config::ParameterContainer& parameters )
       {
+         using CommunicatorType = Communicators::MpiCommunicator;
          return ProblemSetter< Real,
                                Device,
                                Index,
@@ -213,10 +187,11 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Real,
           typename Device,
           typename Index,
-          typename ConfigTag,
-          typename CommunicatorType >
-class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag,CommunicatorType, true >
+          typename ConfigTag >
+class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
 {
+   using CommunicatorType = Communicators::MpiCommunicator;
+
    // wrapper for MeshTypeResolver
    template< typename MeshType >
    using ProblemSetterWrapper = ProblemSetter< Real, Device, Index, MeshType, ConfigTag, SolverStarter< ConfigTag >, CommunicatorType >;
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index d2bbd8159..fa1d23951 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -14,7 +14,6 @@
 #include <TNL/String.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Devices/Host.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Solvers/BuildConfigTags.h>
@@ -66,7 +65,6 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame
     */
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! Communicators::NoDistrCommunicator::setup( parameters ) ||
        ! Communicators::MpiCommunicator::setup( parameters ) 
     )
       return false;
diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h
index a0d171f14..8a4024ac6 100644
--- a/src/Tools/tnl-init.h
+++ b/src/Tools/tnl-init.h
@@ -21,19 +21,18 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 using namespace TNL;
 
 template< typename MeshType,
           typename RealType,
-          typename CommunicatorType,
           int xDiff,
           int yDiff,
           int zDiff >
 bool renderFunction( const Config::ParameterContainer& parameters )
 {
+   using CommunicatorType = Communicators::MpiCommunicator;
 
    using namespace  Meshes::DistributedMeshes;
    using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
@@ -130,20 +129,6 @@ bool renderFunction( const Config::ParameterContainer& parameters )
    return true;
 }
 
-template< typename MeshType,
-          typename RealType,
-          int xDiff,
-          int yDiff,
-          int zDiff >
-bool resolveCommunicator( const Config::ParameterContainer& parameters )
-{
-#ifdef HAVE_MPI
-   if( Communicators::MpiCommunicator::isDistributed() )
-      return renderFunction<MeshType,RealType, Communicators::MpiCommunicator,xDiff,yDiff,zDiff>(parameters);
-#endif
-   return renderFunction<MeshType,RealType, Communicators::NoDistrCommunicator,xDiff,yDiff,zDiff>(parameters);
-}
-
 template< typename MeshType,
           typename RealType >
 bool resolveDerivatives( const Config::ParameterContainer& parameters )
@@ -160,75 +145,75 @@ bool resolveDerivatives( const Config::ParameterContainer& parameters )
       return false;
    }
    if( xDiff == 0 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 0 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 1 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 2 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 3 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 4 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 4 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 4 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 0 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 1 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 2 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 3 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 0 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 1 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 2 >( parameters );
    if( xDiff == 0 && yDiff == 3 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 3, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 3, 0 >( parameters );
    if( xDiff == 0 && yDiff == 3 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 3, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 3, 1 >( parameters );
    if( xDiff == 0 && yDiff == 4 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 4, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 4, 0 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 0 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 1 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 2 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 3 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 0 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 1 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 2 >( parameters );
    if( xDiff == 1 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 2, 0 >( parameters );
    if( xDiff == 1 && yDiff == 2 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 2, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 2, 1 >( parameters );
    if( xDiff == 1 && yDiff == 3 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 3, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 3, 0 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 0 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 1 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 2 >( parameters );
    if( xDiff == 2 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 1, 0 >( parameters );
    if( xDiff == 2 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 2, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 1, 1 >( parameters );
    if( xDiff == 2 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 2, 0 >( parameters );
    if( xDiff == 3 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 3, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 0, 0 >( parameters );
    if( xDiff == 3 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 3, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 0, 1 >( parameters );
    if( xDiff == 3 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 3, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 1, 0 >( parameters );
    if( xDiff == 4 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 4, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 4, 0, 0 >( parameters );
    return false;
 }
 
-- 
GitLab


From fc2a84a72460b5f39e026682352060b07b17ed36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 1 Nov 2020 13:40:00 +0100
Subject: [PATCH 03/50] Replaced NoDistrCommunicator in unit tests with
 MpiCommunicator and added separate test runs without mpirun

---
 src/UnitTests/Containers/CMakeLists.txt       |  9 ++++++++
 .../Containers/DistributedArrayTest.h         |  7 ++----
 .../Containers/DistributedVectorTest.h        |  7 ++----
 .../Containers/VectorBinaryOperationsTest.h   | 22 ++-----------------
 .../Containers/VectorUnaryOperationsTest.h    | 11 ++--------
 .../Containers/VectorVerticalOperationsTest.h | 11 ++--------
 .../Containers/ndarray/CMakeLists.txt         |  4 ++++
 .../DistributedNDArrayOverlaps_1D_test.h      | 15 -------------
 .../DistributedNDArrayOverlaps_semi1D_test.h  |  1 -
 .../ndarray/DistributedNDArray_1D_test.h      | 15 ++-----------
 .../ndarray/DistributedNDArray_semi1D_test.h  |  3 +--
 src/UnitTests/Matrices/CMakeLists.txt         |  1 +
 .../Matrices/DistributedMatrixTest.h          |  7 ++----
 .../DistributedMeshes/CutMeshFunctionTest.cpp |  8 +++----
 14 files changed, 33 insertions(+), 88 deletions(-)

diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt
index fdde0a8b7..efba5e50d 100644
--- a/src/UnitTests/Containers/CMakeLists.txt
+++ b/src/UnitTests/Containers/CMakeLists.txt
@@ -92,30 +92,39 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorUnaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorUnaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorVerticalOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    if( BUILD_CUDA )
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
    endif()
 endif()
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index 204bc6fe7..097a60d26 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Containers/DistributedArray.h>
 #include <TNL/Containers/Partitioner.h>
 
@@ -59,12 +58,10 @@ protected:
 
 // types for which DistributedArrayTest is instantiated
 using DistributedArrayTypes = ::testing::Types<
-   DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >,
-   DistributedArray< double, Devices::Host, int, Communicators::NoDistrCommunicator >
+   DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >
 #ifdef HAVE_CUDA
    ,
-   DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-   DistributedArray< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+   DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >
 #endif
 >;
 
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index 2a1834f31..1d727aef6 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -12,7 +12,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Containers/DistributedVectorView.h>
 #include <TNL/Containers/Partitioner.h>
@@ -69,12 +68,10 @@ protected:
 
 // types for which DistributedVectorTest is instantiated
 using DistributedVectorTypes = ::testing::Types<
-   DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >,
-   DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >
+   DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >
 #ifdef HAVE_CUDA
    ,
-   DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-   DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+   DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >
 #endif
 >;
 
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index d09798453..7f81d87f5 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -14,7 +14,6 @@
 
 #if defined(DISTRIBUTED_VECTOR)
    #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
@@ -154,16 +153,7 @@ protected:
       Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
             DistributedVector<     short, Devices::Host, int, Communicators::MpiCommunicator > >,
       Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >,
-
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >
+            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >
    #else
       Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
             DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
@@ -172,15 +162,7 @@ protected:
       Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
             DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
       Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >
+            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index a5beb58d9..867adb069 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -14,7 +14,6 @@
 
 #if defined(DISTRIBUTED_VECTOR)
    #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
@@ -70,17 +69,11 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
       DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator >
+      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >
    #else
       DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
       DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index 3aa60e612..ac7fa79d6 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -14,7 +14,6 @@
 
 #if defined(DISTRIBUTED_VECTOR)
    #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
@@ -106,17 +105,11 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
       DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator >
+      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >
    #else
       DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
       DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt
index 5be285b5e..f5fb11bdf 100644
--- a/src/UnitTests/Containers/ndarray/CMakeLists.txt
+++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt
@@ -58,13 +58,17 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArray_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) # same executable started directly, without mpirun
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArray_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) # same executable started directly, without mpirun
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) # same executable started directly, without mpirun
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) # same executable started directly, without mpirun
 endif()
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
index 7377cbff2..113d1daa3 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
@@ -74,13 +73,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 Devices::Host >,
                        Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
-// TODO: does it make sense for NoDistrCommunicator?
-//   DistributedNDArray< NDArray< double,
-//                                SizesHolder< int, 0 >,
-//                                std::index_sequence< 0 >,
-//                                Devices::Host >,
-//                       Communicators::NoDistrCommunicator,
-//                       std::index_sequence< 2 > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
@@ -89,13 +81,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 Devices::Cuda >,
                        Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
-// TODO: does it make sense for NoDistrCommunicator?
-//   DistributedNDArray< NDArray< double,
-//                                SizesHolder< int, 0 >,
-//                                std::index_sequence< 0 >,
-//                                Devices::Cuda >,
-//                       Communicators::NoDistrCommunicator,
-//                       std::index_sequence< 2 > >
 #endif
 >;
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
index f1ac970eb..145b0db5b 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index a8d3bcdab..d80e467f5 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
@@ -70,24 +69,14 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Host >,
-                       Communicators::MpiCommunicator >,
-   DistributedNDArray< NDArray< double,
-                                SizesHolder< int, 0 >,
-                                std::index_sequence< 0 >,
-                                Devices::Host >,
-                       Communicators::NoDistrCommunicator >
+                       Communicators::MpiCommunicator >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Cuda >,
-                       Communicators::MpiCommunicator >,
-   DistributedNDArray< NDArray< double,
-                                SizesHolder< int, 0 >,
-                                std::index_sequence< 0 >,
-                                Devices::Cuda >,
-                       Communicators::NoDistrCommunicator >
+                       Communicators::MpiCommunicator >
 #endif
 >;
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
index 6f777c215..a072b2e80 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
@@ -77,7 +76,7 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y, Z
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
                                 Devices::Cuda >,
-                       Communicators::NoDistrCommunicator >
+                       Communicators::MpiCommunicator >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 65723ac88..b713c8f0c 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -58,4 +58,5 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedMatrixTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedMatrixTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" ) # same executable started directly, without mpirun
 endif()
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index 30a76f86a..ea5a7e582 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Matrices/SparseMatrix.h>
@@ -101,12 +100,10 @@ protected:
 
 // types for which DistributedMatrixTest is instantiated
 using DistributedMatrixTypes = ::testing::Types<
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >
 #ifdef HAVE_CUDA
    ,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >
 #endif
 >;
 
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
index 5d034087f..640aa5180 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
@@ -5,7 +5,7 @@
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Meshes/Grid.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/MpiCommunicator.h>
 
 #include "../../Functions/Functions.h"
 
@@ -53,7 +53,7 @@ TEST(CutMeshFunction, 2D)
    //Prepare Mesh Function parts for Cut 
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MpiCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof, 
             StaticVector<1,int>(0),
             StaticVector<1,int>(1),
@@ -116,7 +116,7 @@ TEST(CutMeshFunction, 3D_1)
    //Prepare Mesh Function parts for Cut 
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MpiCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof, 
             StaticVector<1,int>(1),
             StaticVector<2,int>(0,2),
@@ -179,7 +179,7 @@ TEST(CutMeshFunction, 3D_2)
    //Prepare Mesh Function parts for Cut 
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MpiCommunicator, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof, 
             StaticVector<2,int>(2,1),
             StaticVector<1,int>(0),
-- 
GitLab


From 1373faf57f97642050f6a61051f91bc171f3e82a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 1 Nov 2020 13:41:39 +0100
Subject: [PATCH 04/50] Removed NoDistrCommunicator which is now unused

---
 src/TNL/Communicators/NoDistrCommunicator.h | 149 --------------------
 1 file changed, 149 deletions(-)
 delete mode 100644 src/TNL/Communicators/NoDistrCommunicator.h

diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h
deleted file mode 100644
index c0d89015b..000000000
--- a/src/TNL/Communicators/NoDistrCommunicator.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************
-                          NoDistrCommunicator.h  -  description
-                             -------------------
-    begin                : Jan 9, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Logger.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Communicators/MpiDefs.h>
-
-namespace TNL {
-namespace Communicators {
-
-//! \brief Dummy communicator without any distribution support.
-class NoDistrCommunicator
-{
-   public:
-      using Request = int;
-      using CommunicationGroup = int;
-      static constexpr Request NullRequest = -1;
-      static constexpr CommunicationGroup AllGroup = 1;
-      static constexpr CommunicationGroup NullGroup = 0;
-
-      static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ){};
-
-      static bool setup( const Config::ParameterContainer& parameters,
-                         const String& prefix = "" )
-      {
-         return true;
-      }
-
-      static void Init(int& argc, char**& argv) {}
-
-      static void setupRedirection(){}
-
-      static void Finalize(){}
-
-      static bool IsInitialized()
-      {
-          return true;
-      }
-
-      static bool isDistributed()
-      {
-          return false;
-      }
-
-      static int GetRank(CommunicationGroup group = AllGroup )
-      {
-          return 0;
-      }
-
-      static int GetSize(CommunicationGroup group = AllGroup )
-      {
-          return 1;
-      }
-
-      static void DimsCreate(int nproc, int dim, int *distr)
-      {
-          for(int i=0;i<dim;i++)
-          {
-              distr[i]=1;
-          }
-      }
-
-      static void Barrier(CommunicationGroup group = AllGroup)
-      {
-      }
-
-      template <typename T>
-      static Request ISend( const T *data, int count, int dest, int tag, CommunicationGroup group)
-      {
-          return 1;
-      }
-
-      template <typename T>
-      static Request IRecv( const T *data, int count, int src, int tag, CommunicationGroup group)
-      {
-          return 1;
-      }
-
-      static void WaitAll(Request *reqs, int length)
-      {
-      }
-
-      template< typename T >
-      static void Bcast( T* data, int count, int root, CommunicationGroup group)
-      {
-      }
-
-      template< typename T >
-      static void Allreduce( const T* data,
-                             T* reduced_data,
-                             int count,
-                             const MPI_Op &op,
-                             CommunicationGroup group )
-      {
-         memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
-      }
-
-      // in-place variant of Allreduce
-      template< typename T >
-      static void Allreduce( T* data,
-                             int count,
-                             const MPI_Op &op,
-                             CommunicationGroup group )
-      {
-      }
-
-      template< typename T >
-      static void Reduce( T* data,
-                          T* reduced_data,
-                          int count,
-                          MPI_Op &op,
-                          int root,
-                          CommunicationGroup group )
-      {
-         memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) );
-      }
-
-      template< typename T >
-      static void Alltoall( const T* sendData,
-                            int sendCount,
-                            T* receiveData,
-                            int receiveCount,
-                            CommunicationGroup group )
-      {
-         TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount for NoDistrCommunicator." );
-         memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) );
-      }
-
-      static void CreateNewGroup(bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup)
-      {
-         newGroup=oldGroup;
-      }
-
-      static void writeProlog( Logger& logger )
-      {
-      }
-};
-
-} // namespace Communicators
-} // namespace TNL
-- 
GitLab


From f30d68cfa09bc8e43de034a989fcfac6712865c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 2 Nov 2020 20:59:18 +0100
Subject: [PATCH 05/50] PVTUReader: added methods readLocalPointData and
 readLocalCellData

---
 src/TNL/Meshes/Readers/PVTUReader.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h
index 666aa4f45..4bb8ba7eb 100644
--- a/src/TNL/Meshes/Readers/PVTUReader.h
+++ b/src/TNL/Meshes/Readers/PVTUReader.h
@@ -211,6 +211,18 @@ public:
       mesh.setCommunicationGroup( group );
    }
 
+   VariantVector
+   readLocalPointData( std::string arrayName )  // NOTE(review): "local" presumably means this rank's own piece of the data set - confirm
+   {
+      return localReader.readPointData( arrayName );  // pure delegation: arrayName goes to localReader, its result is returned as is
+   }
+
+   VariantVector
+   readLocalCellData( std::string arrayName )  // NOTE(review): "local" presumably means this rank's own piece of the data set - confirm
+   {
+      return localReader.readCellData( arrayName );  // pure delegation: arrayName goes to localReader, its result is returned as is
+   }
+
    virtual void reset() override
    {
       resetBase();
-- 
GitLab


From d798788eddd21bfbfab6dda2acf47fb21d56a4b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 2 Nov 2020 21:17:37 +0100
Subject: [PATCH 06/50] pytnl: added bindings for DistributedMesh and PVTU mesh
 reader

---
 src/Python/pytnl/CMakeLists.txt               |  3 ++
 src/Python/pytnl/tnl/CMakeLists.txt           |  1 +
 src/Python/pytnl/tnl/Mesh.cpp                 | 25 ---------
 src/Python/pytnl/tnl/MeshReaders.cpp          | 37 +++++++++++++
 src/Python/pytnl/tnl/MeshReaders.h            | 47 ++++++++++++++++
 src/Python/pytnl/tnl/tnl.cpp                  |  2 +
 src/Python/pytnl/tnl_conversions.h            |  1 +
 src/Python/pytnl/tnl_mpi/CMakeLists.txt       | 54 +++++++++++++++++++
 src/Python/pytnl/tnl_mpi/DistributedMesh.cpp  | 20 +++++++
 src/Python/pytnl/tnl_mpi/DistributedMesh.h    | 34 ++++++++++++
 .../pytnl/tnl_mpi/DistributedMeshReaders.cpp  | 26 +++++++++
 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp          | 42 +++++++++++++++
 src/Python/pytnl/typedefs.h                   |  5 ++
 src/Python/pytnl/variant_caster.h             | 15 ++++++
 14 files changed, 287 insertions(+), 25 deletions(-)
 create mode 100644 src/Python/pytnl/tnl/MeshReaders.cpp
 create mode 100644 src/Python/pytnl/tnl/MeshReaders.h
 create mode 100644 src/Python/pytnl/tnl_mpi/CMakeLists.txt
 create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
 create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMesh.h
 create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
 create mode 100644 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
 create mode 100644 src/Python/pytnl/variant_caster.h

diff --git a/src/Python/pytnl/CMakeLists.txt b/src/Python/pytnl/CMakeLists.txt
index 2065b0a13..15b8e6b0a 100644
--- a/src/Python/pytnl/CMakeLists.txt
+++ b/src/Python/pytnl/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_subdirectory( tnl )
+if( BUILD_MPI )
+   add_subdirectory( tnl_mpi )
+endif()
 
 install( DIRECTORY . DESTINATION "include/pytnl"
          MESSAGE_NEVER
diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index c7fcd80e2..9c95d6326 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -6,6 +6,7 @@ set( sources
       Grid2D.cpp
       Grid3D.cpp
       Mesh.cpp
+      MeshReaders.cpp
       Object.cpp
       SparseMatrix.cpp
       String.cpp
diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp
index aa0c8c035..a3e582680 100644
--- a/src/Python/pytnl/tnl/Mesh.cpp
+++ b/src/Python/pytnl/tnl/Mesh.cpp
@@ -2,35 +2,10 @@
 #include "../tnl_conversions.h"
 
 #include "Mesh.h"
-#include <TNL/Meshes/Readers/VTKReader.h>
-#include <TNL/Meshes/Readers/VTUReader.h>
-
-template< typename Reader >
-void export_reader( py::module & m, const char* name )
-{
-    py::class_< Reader >( m, name )
-        .def(py::init<std::string>())
-        .def("loadMesh", &Reader::template loadMesh< MeshOfEdges >)
-        .def("loadMesh", &Reader::template loadMesh< MeshOfTriangles >)
-        .def("loadMesh", &Reader::template loadMesh< MeshOfTetrahedrons >)
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfEdges & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTriangles & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTetrahedrons & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-    ;
-}
 
 void export_Meshes( py::module & m )
 {
     export_Mesh< MeshOfEdges >( m, "MeshOfEdges" );
     export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" );
     export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" );
-
-    export_reader< TNL::Meshes::Readers::VTKReader >( m, "VTKReader" );
-    export_reader< TNL::Meshes::Readers::VTUReader >( m, "VTUReader" );
 }
diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp
new file mode 100644
index 000000000..d47ec5268
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshReaders.cpp
@@ -0,0 +1,37 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "MeshReaders.h"
+#include "../typedefs.h"
+
+void export_MeshReaders( py::module & m )
+{
+    using MeshReader = TNL::Meshes::Readers::MeshReader;
+    using XMLVTK = TNL::Meshes::Readers::XMLVTK;
+
+    // base class with trampolines for virtual methods
+    py::class_< MeshReader, PyMeshReader >( m, "MeshReader" )
+        .def(py::init<std::string>())
+        // bindings against the actual class, NOT the trampoline
+        .def("reset", &MeshReader::reset)
+        .def("detectMesh", &MeshReader::detectMesh)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >)
+    ;
+
+    py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" )
+        .def(py::init<std::string>())
+    ;
+
+    // base class for VTUReader and PVTUReader
+    py::class_< XMLVTK, PyXMLVTK, MeshReader >( m, "XMLVTK" )
+        .def(py::init<std::string>())
+        .def("readPointData", &XMLVTK::readPointData)
+        .def("readCellData", &XMLVTK::readCellData)
+   ;
+
+    py::class_< TNL::Meshes::Readers::VTUReader, XMLVTK >( m, "VTUReader" )
+        .def(py::init<std::string>())
+    ;
+}
diff --git a/src/Python/pytnl/tnl/MeshReaders.h b/src/Python/pytnl/tnl/MeshReaders.h
new file mode 100644
index 000000000..22b40a671
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshReaders.h
@@ -0,0 +1,47 @@
+#include <TNL/Meshes/Readers/VTKReader.h>
+#include <TNL/Meshes/Readers/VTUReader.h>
+
+// trampoline classes needed for overriding virtual methods
+// https://pybind11.readthedocs.io/en/stable/advanced/classes.html
+
+class PyMeshReader
+: public TNL::Meshes::Readers::MeshReader
+{
+   using Parent = TNL::Meshes::Readers::MeshReader;
+
+public:
+   // inherit constructors
+   using TNL::Meshes::Readers::MeshReader::MeshReader;
+
+   // trampolines (one for each virtual method)
+   void reset() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, reset );
+   }
+
+   void detectMesh() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh );
+   }
+};
+
+class PyXMLVTK
+: public TNL::Meshes::Readers::XMLVTK
+{
+   using Parent = TNL::Meshes::Readers::XMLVTK;
+
+public:
+   // inherit constructors
+   using TNL::Meshes::Readers::XMLVTK::XMLVTK;
+
+   // trampolines (one for each virtual method)
+   void reset() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, reset );
+   }
+
+   void detectMesh() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh );
+   }
+};
diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp
index 0eb7c3e8b..b60b98d68 100644
--- a/src/Python/pytnl/tnl/tnl.cpp
+++ b/src/Python/pytnl/tnl/tnl.cpp
@@ -14,6 +14,7 @@ void export_Grid1D( py::module & m );
 void export_Grid2D( py::module & m );
 void export_Grid3D( py::module & m );
 void export_Meshes( py::module & m );
+void export_MeshReaders( py::module & m );
 void export_SparseMatrices( py::module & m );
 
 template< typename T >
@@ -42,6 +43,7 @@ PYBIND11_MODULE(tnl, m)
     export_Grid3D(m);
 
     export_Meshes(m);
+    export_MeshReaders(m);
 
     export_SparseMatrices(m);
 }
diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h
index 602d1cffd..e942db324 100644
--- a/src/Python/pytnl/tnl_conversions.h
+++ b/src/Python/pytnl/tnl_conversions.h
@@ -1,3 +1,4 @@
 // conversion has to be registered for each object file
 #include "tnl_str_conversion.h"
 #include "tnl_tuple_conversion.h"
+#include "variant_caster.h"
diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
new file mode 100644
index 000000000..8ea9a70c1
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
@@ -0,0 +1,54 @@
+# enable C++14 for pytnl_mpi (due to py::overload_cast)
+set(PYBIND11_CPP_STANDARD -std=c++14)
+
+set( sources
+      DistributedMesh.cpp
+      DistributedMeshReaders.cpp
+      tnl_mpi.cpp
+)
+pybind11_add_module( pytnl_mpi ${sources} )
+
+# rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so
+set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi )
+
+# Skip -march=native -mtune=native for pytnl_mpi - optimizing python bindings for
+# a specific architecture is not very useful and prevents using Python tools on
+# hybrid clusters.
+get_target_property( pytnl_mpi_COMPILE_OPTIONS pytnl_mpi COMPILE_OPTIONS )
+if( pytnl_mpi_COMPILE_OPTIONS )
+   string( REPLACE "-march=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+   string( REPLACE "-mtune=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+   set_target_properties( pytnl_mpi PROPERTIES COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+endif()
+
+# We have bindings for unsafe objects (e.g. Array::operator[]) where assertion
+# is the only safeguard, so we need to translate the TNL::AssertionError to
+# Python's AssertionError.
+# NDEBUG is defined in the global CMAKE_CXX_FLAGS and cannot be easily removed
+# per-target, so we need to undefine it by passing -U NDEBUG.
+target_compile_options( pytnl_mpi PRIVATE -U NDEBUG -D TNL_THROW_ASSERTION_ERROR )
+
+# disable errors due to -Wunused-value coming from pybind11
+if( ${WITH_CI_FLAGS} )
+   if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      target_compile_options( pytnl_mpi PRIVATE -Wno-error=unused-value )
+   endif()
+endif()
+
+
+# enable zlib and tinyxml2 (used by PVTUReader)
+find_package( ZLIB )
+if( ZLIB_FOUND )
+   target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_ZLIB")
+   target_include_directories(pytnl_mpi PUBLIC ${ZLIB_INCLUDE_DIRS})
+   target_link_libraries(pytnl_mpi PUBLIC ${ZLIB_LIBRARIES})
+endif()
+
+find_package( tinyxml2 QUIET )
+if( tinyxml2_FOUND )
+   target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_TINYXML2")
+   target_link_libraries(pytnl_mpi PUBLIC tinyxml2::tinyxml2)
+endif()
+
+
+install( TARGETS pytnl_mpi DESTINATION ${PYTHON_SITE_PACKAGES_DIR} )
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
new file mode 100644
index 000000000..253c1a9d4
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
@@ -0,0 +1,20 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../typedefs.h"
+#include "DistributedMesh.h"
+#include "../tnl/Array.h"
+
+void export_DistributedMeshes( py::module & m )
+{
+    // make sure that bindings for the local meshes are available
+    py::module_::import("tnl");
+
+    export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" );
+    export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" );
+    export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" );
+
+    // export VTKTypesArrayType
+    using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType;
+    export_Array< VTKTypesArrayType >(m, "VTKTypesArrayType");
+}
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.h b/src/Python/pytnl/tnl_mpi/DistributedMesh.h
new file mode 100644
index 000000000..64afe5978
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+template< typename Mesh >
+void export_DistributedMesh( py::module & m, const char* name )
+{
+    auto mesh = py::class_< Mesh >( m, name )
+        .def(py::init<>())
+        .def_static("getMeshDimension", &Mesh::getMeshDimension)
+//        .def("setCommunicationGroup", &Mesh::setCommunicationGroup)
+//        .def("getCommunicationGroup", &Mesh::getCommunicationGroup)
+        .def("getLocalMesh", py::overload_cast<>(&Mesh::getLocalMesh), py::return_value_policy::reference_internal)
+        .def("setGhostLevels", &Mesh::setGhostLevels)
+        .def("getGhostLevels", &Mesh::getGhostLevels)
+        .def("getGlobalPointIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& {
+                return mesh.template getGlobalIndices< 0 >();
+            },
+            py::return_value_policy::reference_internal)
+        .def("getGlobalCellIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& {
+                return mesh.template getGlobalIndices< Mesh::getMeshDimension() >();
+            },
+            py::return_value_policy::reference_internal)
+        .def("vtkPointGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& {
+                return mesh.vtkPointGhostTypes();
+            },
+            py::return_value_policy::reference_internal)
+        .def("vtkCellGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& {
+                return mesh.vtkCellGhostTypes();
+            },
+            py::return_value_policy::reference_internal)
+    ;
+}
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
new file mode 100644
index 000000000..bb902b5bc
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
@@ -0,0 +1,26 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../tnl/MeshReaders.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/PVTUReader.h>
+
+void export_DistributedMeshReaders( py::module & m )
+{
+    using XMLVTK = TNL::Meshes::Readers::XMLVTK;
+    using PVTUReader = TNL::Meshes::Readers::PVTUReader;
+
+    // make sure that bindings for the parent class are available
+    py::module_::import("tnl");
+
+    py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" )
+        .def(py::init<std::string>())
+        // loadMesh is not virtual in PVTUReader
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >)
+        .def("readLocalPointData", &PVTUReader::readLocalPointData)
+        .def("readLocalCellData", &PVTUReader::readLocalCellData)
+    ;
+}
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
new file mode 100644
index 000000000..6d9986a7a
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -0,0 +1,42 @@
+#include "../exceptions.h"
+#include "../typedefs.h"
+
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+// external functions
+void export_DistributedMeshes( py::module & m );
+void export_DistributedMeshReaders( py::module & m );
+
+#include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
+
+// Python module definition
+PYBIND11_MODULE(tnl_mpi, m)
+{
+    register_exceptions(m);
+
+    // MPI initialization and finalization
+    // https://stackoverflow.com/q/64647846
+    if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) {
+        int argc = 0;
+        char** argv = nullptr;
+        TNL::Communicators::MpiCommunicator::Init( argc, argv );
+    }
+    // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors
+    auto cleanup_callback = []() {
+        if( TNL::Communicators::MpiCommunicator::IsInitialized() )
+            TNL::Communicators::MpiCommunicator::Finalize();
+    };
+    m.add_object("_cleanup", py::capsule(cleanup_callback));
+
+    // bindings for distributed data structures
+    export_DistributedMeshes(m);
+    export_DistributedMeshReaders(m);
+
+    // bindings for functions
+    using TNL::Meshes::DistributedMeshes::distributeSubentities;
+    m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) {
+          distributeSubentities< 1 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) {
+          distributeSubentities< 2 >( mesh ); });
+}
diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h
index 7a74237f0..94e0de5f7 100644
--- a/src/Python/pytnl/typedefs.h
+++ b/src/Python/pytnl/typedefs.h
@@ -2,6 +2,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/Mesh.h>
+#include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DefaultConfig.h>
 #include <TNL/Meshes/Topologies/Edge.h>
 #include <TNL/Meshes/Topologies/Triangle.h>
@@ -37,3 +38,7 @@ using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
                             RealType,
                             IndexType,
                             LocalIndexType > >;
+
+using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >;
+using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >;
+using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >;
diff --git a/src/Python/pytnl/variant_caster.h b/src/Python/pytnl/variant_caster.h
new file mode 100644
index 000000000..c032448b5
--- /dev/null
+++ b/src/Python/pytnl/variant_caster.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <mpark/variant.hpp>   // backport of std::variant from C++17
+
+namespace pybind11 { namespace detail {
+
+// add specialization for concrete variant type
+// (variant_caster is implemented in pybind11 and used for C++17's std::variant casting)
+template<class... Args> struct type_caster<mpark::variant<Args...>>
+    : variant_caster<mpark::variant<Args...>> {};
+
+}} // namespace pybind11::detail
-- 
GitLab


From 51f77b2f597c210191091409cf1e5ecd20b8bdec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 4 Nov 2020 15:31:16 +0100
Subject: [PATCH 07/50] pytnl: set proper debug postfix and pybind11 module
 names

Python cannot easily import modules containing "-" so we use "_dbg"
instead of "-dbg".
---
 src/Python/pytnl/tnl/CMakeLists.txt                |  7 ++++++-
 src/Python/pytnl/tnl/tnl.cpp                       |  2 +-
 src/Python/pytnl/tnl_mpi/CMakeLists.txt            |  7 ++++++-
 src/Python/pytnl/tnl_mpi/DistributedMesh.cpp       |  2 +-
 .../pytnl/tnl_mpi/DistributedMeshReaders.cpp       |  2 +-
 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp               |  2 +-
 src/Python/pytnl/typedefs.h                        | 14 ++++++++++++++
 7 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index 9c95d6326..d06a4d16e 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -15,7 +15,12 @@ set( sources
 pybind11_add_module( pytnl ${sources} )
 
 # rename the shared library to tnl.cpython-XXm-x86_64-linux-gnu.so
-set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl )
+set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl DEBUG_POSTFIX "_dbg" )
+
+# indicate the postfix to the target so that the pybind11 module name can be set accordingly
+if( CMAKE_BUILD_TYPE STREQUAL "Debug")
+   target_compile_options( pytnl PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg )
+endif()
 
 # Skip -march=native -mtune=native for pytnl - optimizing python bindings for
 # a specific architecture is not very useful and prevents using Python tools on
diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp
index b60b98d68..75bd08421 100644
--- a/src/Python/pytnl/tnl/tnl.cpp
+++ b/src/Python/pytnl/tnl/tnl.cpp
@@ -24,7 +24,7 @@ template< typename T >
 using _vector = TNL::Containers::Vector< T, TNL::Devices::Host, IndexType >;
 
 // Python module definition
-PYBIND11_MODULE(tnl, m)
+PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m)
 {
     register_exceptions(m);
 
diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
index 8ea9a70c1..ee5e9cc32 100644
--- a/src/Python/pytnl/tnl_mpi/CMakeLists.txt
+++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
@@ -9,7 +9,12 @@ set( sources
 pybind11_add_module( pytnl_mpi ${sources} )
 
 # rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so
-set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi )
+set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi DEBUG_POSTFIX "_dbg" )
+
+# indicate the postfix to the target so that the pybind11 module name can be set accordingly
+if( CMAKE_BUILD_TYPE STREQUAL "Debug")
+   target_compile_options( pytnl_mpi PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg )
+endif()
 
 # Skip -march=native -mtune=native for pytnl_mpi - optimizing python bindings for
 # a specific architecture is not very useful and prevents using Python tools on
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
index 253c1a9d4..03ee3692e 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
@@ -8,7 +8,7 @@
 void export_DistributedMeshes( py::module & m )
 {
     // make sure that bindings for the local meshes are available
-    py::module_::import("tnl");
+    py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl)));
 
     export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" );
     export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" );
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
index bb902b5bc..7847e340b 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
@@ -12,7 +12,7 @@ void export_DistributedMeshReaders( py::module & m )
     using PVTUReader = TNL::Meshes::Readers::PVTUReader;
 
     // make sure that bindings for the parent class are available
-    py::module_::import("tnl");
+    py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl)));
 
     py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" )
         .def(py::init<std::string>())
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
index 6d9986a7a..de2359ac2 100644
--- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -11,7 +11,7 @@ void export_DistributedMeshReaders( py::module & m );
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 
 // Python module definition
-PYBIND11_MODULE(tnl_mpi, m)
+PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m)
 {
     register_exceptions(m);
 
diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h
index 94e0de5f7..ac4b6bd83 100644
--- a/src/Python/pytnl/typedefs.h
+++ b/src/Python/pytnl/typedefs.h
@@ -1,5 +1,19 @@
 #pragma once
 
+// helper macros (the _NX variants are needed to expand macros in the arguments)
+#define PYTNL_STRINGIFY(U) PYTNL_STRINGIFY_NX(U)
+#define PYTNL_STRINGIFY_NX(U) #U
+
+#define PYTNL_PPCAT(A, B) PYTNL_PPCAT_NX(A, B)
+#define PYTNL_PPCAT_NX(A, B) A ## B
+
+// the Python module name depends on the build type, this macro can be used to concatenate with the correct suffix
+#ifdef PYTNL_MODULE_POSTFIX
+   #define PYTNL_MODULE_NAME(name) PYTNL_PPCAT(name, PYTNL_MODULE_POSTFIX)
+#else
+   #define PYTNL_MODULE_NAME(name) name
+#endif
+
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/Mesh.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
-- 
GitLab


From b8fa05a2d6fc10fa75fb4bd4723fa62d581e408e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 4 Nov 2020 15:34:53 +0100
Subject: [PATCH 08/50] pytnl: added bindings for VTKTraits

---
 src/Python/pytnl/tnl/CMakeLists.txt |  1 +
 src/Python/pytnl/tnl/VTKTraits.cpp  | 45 +++++++++++++++++++++++++++++
 src/Python/pytnl/tnl/tnl.cpp        |  3 ++
 src/TNL/Meshes/VTKTraits.h          |  6 ++--
 4 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 src/Python/pytnl/tnl/VTKTraits.cpp

diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index d06a4d16e..34fe9b179 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -10,6 +10,7 @@ set( sources
       Object.cpp
       SparseMatrix.cpp
       String.cpp
+      VTKTraits.cpp
       tnl.cpp
 )
 pybind11_add_module( pytnl ${sources} )
diff --git a/src/Python/pytnl/tnl/VTKTraits.cpp b/src/Python/pytnl/tnl/VTKTraits.cpp
new file mode 100644
index 000000000..85d796471
--- /dev/null
+++ b/src/Python/pytnl/tnl/VTKTraits.cpp
@@ -0,0 +1,45 @@
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+#include <TNL/Meshes/VTKTraits.h>
+
+void export_VTKTraits( py::module & m )
+{
+    py::enum_< TNL::Meshes::VTK::FileFormat >( m, "VTKFileFormat")
+       .value("ascii", TNL::Meshes::VTK::FileFormat::ascii)
+       .value("binary", TNL::Meshes::VTK::FileFormat::binary)
+       .value("zlib_compressed", TNL::Meshes::VTK::FileFormat::zlib_compressed)
+    ;
+    py::enum_< TNL::Meshes::VTK::DataType >( m, "VTKDataType")
+       .value("CellData", TNL::Meshes::VTK::DataType::CellData)
+       .value("PointData", TNL::Meshes::VTK::DataType::PointData)
+    ;
+    py::enum_< TNL::Meshes::VTK::EntityShape >( m, "VTKEntityShape")
+       .value("Vertex", TNL::Meshes::VTK::EntityShape::Vertex)
+       .value("PolyVertex", TNL::Meshes::VTK::EntityShape::PolyVertex)
+       .value("Line", TNL::Meshes::VTK::EntityShape::Line)
+       .value("PolyLine", TNL::Meshes::VTK::EntityShape::PolyLine)
+       .value("Triangle", TNL::Meshes::VTK::EntityShape::Triangle)
+       .value("TriangleStrip", TNL::Meshes::VTK::EntityShape::TriangleStrip)
+       .value("Polygon", TNL::Meshes::VTK::EntityShape::Polygon)
+       .value("Pixel", TNL::Meshes::VTK::EntityShape::Pixel)
+       .value("Quad", TNL::Meshes::VTK::EntityShape::Quad)
+       .value("Tetra", TNL::Meshes::VTK::EntityShape::Tetra)
+       .value("Voxel", TNL::Meshes::VTK::EntityShape::Voxel)
+       .value("Hexahedron", TNL::Meshes::VTK::EntityShape::Hexahedron)
+       .value("Wedge", TNL::Meshes::VTK::EntityShape::Wedge)
+       .value("Pyramid", TNL::Meshes::VTK::EntityShape::Pyramid)
+    ;
+    py::enum_< TNL::Meshes::VTK::CellGhostTypes >( m, "VTKCellGhostTypes")
+       .value("DUPLICATECELL", TNL::Meshes::VTK::CellGhostTypes::DUPLICATECELL,               "the cell is present on multiple processors")
+       .value("HIGHCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::HIGHCONNECTIVITYCELL, "the cell has more neighbors than in a regular mesh")
+       .value("LOWCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::LOWCONNECTIVITYCELL,   "the cell has less neighbors than in a regular mesh")
+       .value("REFINEDCELL", TNL::Meshes::VTK::CellGhostTypes::REFINEDCELL,                   "other cells are present that refines it")
+       .value("EXTERIORCELL", TNL::Meshes::VTK::CellGhostTypes::EXTERIORCELL,                 "the cell is on the exterior of the data set")
+       .value("HIDDENCELL", TNL::Meshes::VTK::CellGhostTypes::HIDDENCELL,                     "the cell is needed to maintain connectivity, but the data values should be ignored")
+    ;
+    py::enum_< TNL::Meshes::VTK::PointGhostTypes >( m, "VTKPointGhostTypes")
+       .value("DUPLICATEPOINT", TNL::Meshes::VTK::PointGhostTypes::DUPLICATEPOINT, "the point is present on multiple processors")
+       .value("HIDDENPOINT", TNL::Meshes::VTK::PointGhostTypes::HIDDENPOINT,       "the point is needed to maintain connectivity, but the data values should be ignored")
+    ;
+}
diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp
index 75bd08421..2b2a852fa 100644
--- a/src/Python/pytnl/tnl/tnl.cpp
+++ b/src/Python/pytnl/tnl/tnl.cpp
@@ -13,6 +13,7 @@ void export_String( py::module & m );
 void export_Grid1D( py::module & m );
 void export_Grid2D( py::module & m );
 void export_Grid3D( py::module & m );
+void export_VTKTraits( py::module & m );
 void export_Meshes( py::module & m );
 void export_MeshReaders( py::module & m );
 void export_SparseMatrices( py::module & m );
@@ -42,6 +43,8 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m)
     export_Grid2D(m);
     export_Grid3D(m);
 
+    export_VTKTraits(m);
+
     export_Meshes(m);
     export_MeshReaders(m);
 
diff --git a/src/TNL/Meshes/VTKTraits.h b/src/TNL/Meshes/VTKTraits.h
index e09b6c342..0883b607a 100644
--- a/src/TNL/Meshes/VTKTraits.h
+++ b/src/TNL/Meshes/VTKTraits.h
@@ -172,16 +172,16 @@ enum class CellGhostTypes
    DUPLICATECELL = 1,        // the cell is present on multiple processors
    HIGHCONNECTIVITYCELL = 2, // the cell has more neighbors than in a regular mesh
    LOWCONNECTIVITYCELL = 4,  // the cell has less neighbors than in a regular mesh
-   REFINEDCELL = 8,          // other cells are present that refines it.
+   REFINEDCELL = 8,          // other cells are present that refines it
    EXTERIORCELL = 16,        // the cell is on the exterior of the data set
-   HIDDENCELL = 32           // the cell is needed to maintain connectivity, but the data values should be ignored.
+   HIDDENCELL = 32           // the cell is needed to maintain connectivity, but the data values should be ignored
 };
 
 enum class PointGhostTypes
 : std::uint8_t
 {
    DUPLICATEPOINT = 1,  // the cell is present on multiple processors
-   HIDDENPOINT = 2      // the point is needed to maintain connectivity, but the data values should be ignored.
+   HIDDENPOINT = 2      // the point is needed to maintain connectivity, but the data values should be ignored
 };
 
 /**
-- 
GitLab


From 85c1a04e2321a137c9c1e295341157f9ff496bd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 4 Nov 2020 23:09:09 +0100
Subject: [PATCH 09/50] Mesh writers: avoid reference in class members,
 initialize by rdbuf

Storing the stream as a reference member is fragile, because a reference
member cannot extend the lifetime of the object it is bound to. Storing
a std::ostream by value and initializing it from rdbuf() (which is a
pointer to the stream buffer) works better, probably because the
underlying stream buffer generally outlives the ostream objects that
were being passed to the writers.
---
 src/TNL/Meshes/Writers/PVTUWriter.h | 4 ++--
 src/TNL/Meshes/Writers/VTKWriter.h  | 4 ++--
 src/TNL/Meshes/Writers/VTUWriter.h  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h
index 8ef4d2b7b..5aa9cd2b0 100644
--- a/src/TNL/Meshes/Writers/PVTUWriter.h
+++ b/src/TNL/Meshes/Writers/PVTUWriter.h
@@ -31,7 +31,7 @@ public:
    PVTUWriter() = delete;
 
    PVTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {}
 
    // If desired, cycle and time of the simulation can put into the file. This follows the instructions at
@@ -79,7 +79,7 @@ protected:
 
    void writeFooter();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
diff --git a/src/TNL/Meshes/Writers/VTKWriter.h b/src/TNL/Meshes/Writers/VTKWriter.h
index e1c5fae97..db0c09b13 100644
--- a/src/TNL/Meshes/Writers/VTKWriter.h
+++ b/src/TNL/Meshes/Writers/VTKWriter.h
@@ -45,7 +45,7 @@ public:
    VTKWriter() = delete;
 
    VTKWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::binary )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {
       if( format != VTK::FileFormat::ascii && format != VTK::FileFormat::binary )
          throw std::domain_error("The Legacy VTK file formats support only ASCII and BINARY formats.");
@@ -78,7 +78,7 @@ protected:
 
    void writeHeader();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
diff --git a/src/TNL/Meshes/Writers/VTUWriter.h b/src/TNL/Meshes/Writers/VTUWriter.h
index 9f715dce6..00765cc0d 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.h
+++ b/src/TNL/Meshes/Writers/VTUWriter.h
@@ -44,7 +44,7 @@ public:
    VTUWriter() = delete;
 
    VTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {}
 
    // If desired, cycle and time of the simulation can put into the file. This follows the instructions at
@@ -78,7 +78,7 @@ protected:
 
    void writeFooter();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
-- 
GitLab


From 912af6da79cb51f6f2faaaa67d8e1f9e98374ade Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 5 Nov 2020 23:47:09 +0100
Subject: [PATCH 10/50] Fixed HostArray type in mesh writers

- ValueType must be non-const (otherwise it won't work with an array
  view with a const ValueType)
- IndexType should be taken from the array
---
 src/TNL/Meshes/Writers/VTKWriter.hpp | 2 +-
 src/TNL/Meshes/Writers/VTUWriter.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Meshes/Writers/VTKWriter.hpp b/src/TNL/Meshes/Writers/VTKWriter.hpp
index 125366d03..801d3bc19 100644
--- a/src/TNL/Meshes/Writers/VTKWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTKWriter.hpp
@@ -509,7 +509,7 @@ VTKWriter< Mesh >::writeDataArray( const Array& array,
    // use a host buffer if direct access to the array elements is not possible
    if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value )
    {
-      using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >;
+      using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >;
       HostArray hostBuffer;
       hostBuffer = array;
       writeDataArray( hostBuffer, name, numberOfComponents );
diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp
index 8d609f0a7..61872ffe1 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTUWriter.hpp
@@ -459,7 +459,7 @@ VTUWriter< Mesh >::writeDataArray( const Array& array,
    // use a host buffer if direct access to the array elements is not possible
    if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value )
    {
-      using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >;
+      using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >;
       HostArray hostBuffer;
       hostBuffer = array;
       writeDataArray( hostBuffer, name, numberOfComponents );
-- 
GitLab


From b426950f236e14edf99ed11a257adab9475176d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 4 Nov 2020 23:14:04 +0100
Subject: [PATCH 11/50] pytnl: added bindings for mesh writers

---
 src/3rdparty/CMakeLists.txt          |   6 +
 src/3rdparty/cctbx/pystreambuf.h     | 519 +++++++++++++++++++++++++++
 src/Python/pytnl/iostream_caster.h   |  59 +++
 src/Python/pytnl/tnl/CMakeLists.txt  |   1 +
 src/Python/pytnl/tnl/MeshWriters.cpp |  88 +++++
 src/Python/pytnl/tnl/MeshWriters.h   |  22 ++
 src/Python/pytnl/tnl/tnl.cpp         |   2 +
 src/Python/pytnl/tnl_conversions.h   |   1 +
 8 files changed, 698 insertions(+)
 create mode 100644 src/3rdparty/cctbx/pystreambuf.h
 create mode 100644 src/Python/pytnl/iostream_caster.h
 create mode 100644 src/Python/pytnl/tnl/MeshWriters.cpp
 create mode 100644 src/Python/pytnl/tnl/MeshWriters.h

diff --git a/src/3rdparty/CMakeLists.txt b/src/3rdparty/CMakeLists.txt
index 6dba288f0..01550de19 100644
--- a/src/3rdparty/CMakeLists.txt
+++ b/src/3rdparty/CMakeLists.txt
@@ -1,3 +1,9 @@
 install( DIRECTORY mpark Leksys TYPE INCLUDE
          MESSAGE_NEVER
          FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" )
+
+if( ${WITH_PYTHON} )
+   install( DIRECTORY cctbx TYPE INCLUDE
+            MESSAGE_NEVER
+            FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" )
+endif()
diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h
new file mode 100644
index 000000000..6e0c497e4
--- /dev/null
+++ b/src/3rdparty/cctbx/pystreambuf.h
@@ -0,0 +1,519 @@
+/* Original code: https://gist.github.com/asford/544323a5da7dddad2c9174490eb5ed06
+ * License:
+ * This component utilizes components derived from cctbx, available at
+ * http://cci.lbl.gov/cctbx_sources/boost_adaptbx/python_streambuf.h
+ *
+ * *** License agreement ***
+ *
+ * cctbx Copyright (c) 2006, The Regents of the University of
+ * California, through Lawrence Berkeley National Laboratory (subject to
+ * receipt of any required approvals from the U.S. Dept. of Energy).  All
+ * rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * (1) Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * (2) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * (3) Neither the name of the University of California, Lawrence Berkeley
+ * National Laboratory, U.S. Dept. of Energy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You are under no obligation whatsoever to provide any bug fixes,
+ * patches, or upgrades to the features, functionality or performance of
+ * the source code ("Enhancements") to anyone; however, if you choose to
+ * make your Enhancements available either publicly, or directly to
+ * Lawrence Berkeley National Laboratory, without imposing a separate
+ * written license agreement for such Enhancements, then you hereby grant
+ * the following license: a  non-exclusive, royalty-free perpetual license
+ * to install, use, modify, prepare derivative works, incorporate into
+ * other computer software, distribute, and sublicense such enhancements or
+ * derivative works thereof, in binary and source code form.
+*/
+
+#pragma once
+
+#include <streambuf>
+#include <iostream>
+
+#include <pybind11/pybind11.h>
+
+namespace pystreambuf {
+
+/// A stream buffer getting data from and putting data into a Python file object
+/** The aims are as follow:
+
+    - Given a C++ function acting on a standard stream, e.g.
+
+      \code
+      void read_inputs(std::istream& input) {
+        ...
+        input >> something >> something_else;
+      }
+      \endcode
+
+      and given a piece of Python code which creates a file-like object,
+      to be able to pass this file object to that C++ function, e.g.
+
+      \code
+      import gzip
+      gzip_file_obj = gzip.GzipFile(...)
+      read_inputs(gzip_file_obj)
+      \endcode
+
+      and have the standard stream pull data from and put data into the Python
+      file object.
+
+    - When Python \c read_inputs() returns, the Python object is able to
+      continue reading or writing where the C++ code left off.
+
+    - Operations in C++ on mere files should be competitively fast compared
+      to the direct use of \c std::fstream.
+
+
+    \b Motivation
+
+      - the standard Python library offer of file-like objects (files,
+        compressed files and archives, network, ...) is far superior to the
+        offer of streams in the C++ standard library and Boost C++ libraries.
+
+      - i/o code involves a fair amount of text processing which is more
+        efficiently prototyped in Python but then one may need to rewrite
+        a time-critical part in C++, in as seamless a manner as possible.
+
+    \b Usage
+
+    This is 2-step:
+
+      - a trivial wrapper function
+
+        \code
+          using boost_adaptbx::python::streambuf;
+          void read_inputs_wrapper(streambuf& input)
+          {
+            streambuf::istream is(input);
+            read_inputs(is);
+          }
+
+          def("read_inputs", read_inputs_wrapper);
+        \endcode
+
+        which has to be written every time one wants a Python binding for
+        such a C++ function.
+
+      - the Python side
+
+        \code
+          from boost.python import streambuf
+          read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
+        \endcode
+
+        \c buffer_size is optional. See also: \c default_buffer_size
+
+  Note: references are to the C++ standard (the numbers between parentheses
+  at the end of references are margin markers).
+*/
+class streambuf : public std::basic_streambuf<char>
+{
+  private:
+    typedef std::basic_streambuf<char> base_t;
+
+  public:
+    /* The syntax
+        using base_t::char_type;
+       would be nicer but Visual Studio C++ 8 chokes on it
+    */
+    typedef base_t::char_type   char_type;
+    typedef base_t::int_type    int_type;
+    typedef base_t::pos_type    pos_type;
+    typedef base_t::off_type    off_type;
+    typedef base_t::traits_type traits_type;
+
+    /// The default size of the read and write buffer.
+    /** They are respectively used to buffer data read from and data written to
+        the Python file object. It can be modified from Python.
+    */
+    static constexpr std::size_t default_buffer_size = 1024;
+
+    /// Construct from a Python file object
+    /** if buffer_size is 0 the current default_buffer_size is used.
+    */
+    streambuf(
+      pybind11::object& python_file_obj,
+      std::size_t buffer_size_=0)
+    :
+      py_read (getattr(python_file_obj, "read", pybind11::none())),
+      py_write (getattr(python_file_obj, "write", pybind11::none())),
+      py_seek (getattr(python_file_obj, "seek", pybind11::none())),
+      py_tell (getattr(python_file_obj, "tell", pybind11::none())),
+      buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
+      write_buffer(0),
+      pos_of_read_buffer_end_in_py_file(0),
+      pos_of_write_buffer_end_in_py_file(buffer_size),
+      farthest_pptr(0)
+    {
+      assert(buffer_size != 0);
+      /* Some Python file objects (e.g. sys.stdout and sys.stdin)
+         have non-functional seek and tell. If so, assign None to
+         py_tell and py_seek.
+       */
+      if (!py_tell.is_none()) {
+        try {
+          py_tell();
+        }
+        catch (pybind11::error_already_set& err) {
+          py_tell = pybind11::none();
+          py_seek = pybind11::none();
+          err.restore();
+          PyErr_Clear();
+        }
+      }
+
+      if (!py_write.is_none()) {
+        // C-like string to make debugging easier
+        write_buffer = new char[buffer_size + 1];
+        write_buffer[buffer_size] = '\0';
+        setp(write_buffer, write_buffer + buffer_size);  // 27.5.2.4.5 (5)
+        farthest_pptr = pptr();
+      }
+      else {
+        // The first attempt at output will result in a call to overflow
+        setp(0, 0);
+      }
+
+      if (!py_tell.is_none()){
+        off_type py_pos = py_tell().cast<off_type>();
+        pos_of_read_buffer_end_in_py_file = py_pos;
+        pos_of_write_buffer_end_in_py_file = py_pos;
+      }
+    }
+
+    /// Mundane destructor freeing the allocated resources
+    virtual ~streambuf() {
+      if (write_buffer) delete[] write_buffer;
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.3
+    /** It is essential to override this virtual function for the stream
+        member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
+     */
+    virtual std::streamsize showmanyc() {
+      int_type const failure = traits_type::eof();
+      int_type status = underflow();
+      if (status == failure) return -1;
+      return egptr() - gptr();
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.3
+    virtual int_type underflow() {
+      int_type const failure = traits_type::eof();
+      if (py_read.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'read' attribute");
+      }
+      read_buffer = py_read(buffer_size);
+      char *read_buffer_data;
+      pybind11::ssize_t py_n_read;
+      if (PYBIND11_BYTES_AS_STRING_AND_SIZE(read_buffer.ptr(),
+            &read_buffer_data, &py_n_read) == -1) {
+        setg(0, 0, 0);
+        throw std::invalid_argument(
+          "The method 'read' of the Python file object "
+          "did not return a string.");
+      }
+      off_type n_read = (off_type)py_n_read;
+      pos_of_read_buffer_end_in_py_file += n_read;
+      setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
+      // ^^^27.5.2.3.1 (4)
+      if (n_read == 0) return failure;
+      return traits_type::to_int_type(read_buffer_data[0]);
+    }
+
+    /// Flush the put area, then \c c if given, to the Python file (C.f. C++ standard 27.5.2.4.5)
+    virtual int_type overflow(int_type c=traits_type::eof()) {
+      if (py_write.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'write' attribute");
+      }
+      farthest_pptr = std::max(farthest_pptr, pptr());
+      off_type n_written = (off_type)(farthest_pptr - pbase());
+      pybind11::bytes chunk(pbase(), n_written);
+      py_write(chunk);
+      if (!traits_type::eq_int_type(c, traits_type::eof())) {
+        // send c as bytes like the chunk; a bare char would arrive in Python as str
+        char_type const last = traits_type::to_char_type(c);
+        py_write(pybind11::bytes(&last, 1));
+        n_written++;
+      }
+      if (n_written) {
+        pos_of_write_buffer_end_in_py_file += n_written;
+        setp(pbase(), epptr());  // 27.5.2.4.5 (5)
+        farthest_pptr = pptr();
+      }
+      return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::not_eof(c) : c;
+    }
+
+    /// Update the python file to reflect the state of this stream buffer
+    /** Empty the write buffer into the Python file object and set the seek
+        position of the latter accordingly (C++ standard section 27.5.2.4.2).
+        If there is no write buffer or it is empty, but there is a non-empty
+        read buffer, set the Python file object seek position to the
+        seek position in that read buffer.
+    */
+    virtual int sync() {
+      int result = 0;
+      farthest_pptr = std::max(farthest_pptr, pptr());
+      if (farthest_pptr && farthest_pptr > pbase()) {
+        off_type delta = pptr() - farthest_pptr;
+        int_type status = overflow();
+        if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
+        if (!py_seek.is_none()) py_seek(delta, 1);
+      }
+      else if (gptr() && gptr() < egptr()) {
+        if (!py_seek.is_none()) py_seek(gptr() - egptr(), 1);
+      }
+      return result;
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.2
+    /** This implementation is optimised to look whether the position is within
+        the buffers, so as to avoid calling Python seek or tell. It is
+        important for many applications that the overhead of calling into Python
+        is avoided as much as possible (e.g. parsers which may do a lot of
+        backtracking)
+    */
+    virtual
+    pos_type seekoff(off_type off, std::ios_base::seekdir way,
+                     std::ios_base::openmode which=  std::ios_base::in
+                                                   | std::ios_base::out)
+    {
+      /* In practice, "which" is either std::ios_base::in or out
+         since we end up here because either seekp or seekg was called
+         on the stream using this buffer. That simplifies the code
+         in a few places.
+      */
+      int const failure = off_type(-1);
+
+      if (py_seek.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'seek' attribute");
+      }
+
+      // we need the read buffer to contain something!
+      if (which == std::ios_base::in && !gptr()) {
+        if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
+          return failure;
+        }
+      }
+
+      // compute the whence parameter for Python seek
+      int whence;
+      switch (way) {
+        case std::ios_base::beg:
+          whence = 0;
+          break;
+        case std::ios_base::cur:
+          whence = 1;
+          break;
+        case std::ios_base::end:
+          whence = 2;
+          break;
+        default:
+          return failure;
+      }
+
+      // Let's have a go
+      off_type result;
+      if (!seekoff_without_calling_python(off, way, which, result)) {
+        // we need to call Python
+        if (which == std::ios_base::out) overflow();
+        if (way == std::ios_base::cur) {
+          if      (which == std::ios_base::in)  off -= egptr() - gptr();
+          else if (which == std::ios_base::out) off += pptr() - pbase();
+        }
+        py_seek(off, whence);
+        result = off_type(py_tell().cast<off_type>());
+        if (which == std::ios_base::in) underflow();
+      }
+      return result;
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.2
+    virtual
+    pos_type seekpos(pos_type sp,
+                     std::ios_base::openmode which=  std::ios_base::in
+                                                   | std::ios_base::out)
+    {
+      return streambuf::seekoff(sp, std::ios_base::beg, which);
+    }
+
+  private:
+    pybind11::object py_read, py_write, py_seek, py_tell;
+
+    std::size_t buffer_size;
+
+    /* This is actually a Python bytes object and the actual read buffer is
+       its internal data, i.e. an array of characters.
+     */
+    pybind11::bytes read_buffer;
+
+    /* A mere array of char's allocated on the heap at construction time and
+       de-allocated only at destruction time.
+    */
+    char *write_buffer;
+
+    off_type pos_of_read_buffer_end_in_py_file,
+             pos_of_write_buffer_end_in_py_file;
+
+    // the farthest place the buffer has been written into
+    char *farthest_pptr;
+
+
+    /// Try to serve a seek request by moving inside the current buffer only.
+    /** Returns true and stores the sought position (in Python file coordinates)
+        in \c result; returns false when the Python seek has to be used instead. */
+    bool seekoff_without_calling_python(off_type off,
+      std::ios_base::seekdir way, std::ios_base::openmode which, off_type & result)
+    {
+      // Buffer range and current position
+      off_type buf_begin, buf_end, buf_cur, upper_bound;
+      off_type pos_of_buffer_end_in_py_file;
+      if (which == std::ios_base::in) {
+        pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
+        buf_begin = reinterpret_cast<std::streamsize>(eback());
+        buf_cur = reinterpret_cast<std::streamsize>(gptr());
+        buf_end = reinterpret_cast<std::streamsize>(egptr());
+        upper_bound = buf_end;
+      }
+      else if (which == std::ios_base::out) {
+        pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
+        buf_begin = reinterpret_cast<std::streamsize>(pbase());
+        buf_cur = reinterpret_cast<std::streamsize>(pptr());
+        buf_end = reinterpret_cast<std::streamsize>(epptr());
+        farthest_pptr = std::max(farthest_pptr, pptr());
+        upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
+      }
+      else {  // e.g. in|out: continuing would read the uninitialised locals above
+        throw std::runtime_error(
+          "Control flow passes through branch that should be unreachable.");
+      }
+
+      // Sought position in "buffer coordinate"
+      off_type buf_sought;
+      if (way == std::ios_base::cur) {
+        buf_sought = buf_cur + off;
+      }
+      else if (way == std::ios_base::beg) {
+        buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
+      }
+      else if (way == std::ios_base::end) {
+        return false;
+      }
+      else {  // unknown seek direction: buf_sought would stay uninitialised
+        throw std::runtime_error(
+          "Control flow passes through branch that should be unreachable.");
+      }
+
+      // if the sought position is not in the buffer, give up
+      if (buf_sought < buf_begin || buf_sought >= upper_bound) return false;
+
+      // we are in wonderland
+      if      (which == std::ios_base::in)  gbump(buf_sought - buf_cur);
+      else if (which == std::ios_base::out) pbump(buf_sought - buf_cur);
+
+      result = pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
+      return true;
+    }
+
+  public:
+
+    class istream : public std::istream
+    {
+      public:
+        istream(streambuf& buf) : std::istream(&buf)
+        {
+          exceptions(std::ios_base::badbit);
+        }
+
+        ~istream() { if (this->good()) this->sync(); }
+    };
+
+    class ostream : public std::ostream
+    {
+      public:
+        ostream(streambuf& buf) : std::ostream(&buf)
+        {
+          exceptions(std::ios_base::badbit);
+        }
+
+        ~ostream() { if (this->good()) this->flush(); }
+    };
+};
+
+struct streambuf_capsule
+{
+  streambuf python_streambuf;
+
+  streambuf_capsule(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    python_streambuf(python_file_obj, buffer_size)
+  {}
+};
+
+struct ostream : private streambuf_capsule, streambuf::ostream
+{
+  ostream(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    streambuf_capsule(python_file_obj, buffer_size),
+    streambuf::ostream(python_streambuf)
+  {}
+
+  ~ostream()
+  {
+    if (this->good()){
+      this->flush();
+    }
+  }
+};
+
+struct istream : private streambuf_capsule, streambuf::istream
+{
+  istream(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    streambuf_capsule(python_file_obj, buffer_size),
+    streambuf::istream(python_streambuf)
+  {}
+
+  ~istream()
+  {
+    if (this->good()) {
+      this->sync();
+    }
+  }
+};
+
+} // namespace pystreambuf
diff --git a/src/Python/pytnl/iostream_caster.h b/src/Python/pytnl/iostream_caster.h
new file mode 100644
index 000000000..38f5d4e16
--- /dev/null
+++ b/src/Python/pytnl/iostream_caster.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cctbx/pystreambuf.h>
+
+namespace pybind11 { namespace detail {
+    template <> struct type_caster<std::istream> {
+    public:
+        bool load(handle src, bool) {
+            if (getattr(src, "read", none()).is_none()){
+              return false;
+            }
+
+            obj = reinterpret_borrow<object>(src);
+            value = std::unique_ptr<pystreambuf::istream>(new pystreambuf::istream(obj, 0));
+
+            return true;
+        }
+
+    protected:
+        object obj;
+        std::unique_ptr<pystreambuf::istream> value;
+
+    public:
+        static constexpr auto name = _("istream");
+        static handle cast(const std::istream *src, return_value_policy policy, handle parent) {
+            return none().release();
+        }
+        operator std::istream*() { return value.get(); }
+        operator std::istream&() { return *value; }
+        template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+    };
+
+    template <> struct type_caster<std::ostream> {
+    public:
+        bool load(handle src, bool) {
+            if (getattr(src, "write", none()).is_none()){
+              return false;
+            }
+
+            obj = reinterpret_borrow<object>(src);
+            value = std::unique_ptr<pystreambuf::ostream>(new pystreambuf::ostream(obj, 0));
+
+            return true;
+        }
+
+    protected:
+        object obj;
+        std::unique_ptr<pystreambuf::ostream> value;
+
+    public:
+        static constexpr auto name = _("ostream");
+        static handle cast(const std::ostream *src, return_value_policy policy, handle parent) {
+            return none().release();
+        }
+        operator std::ostream*() { return value.get(); }
+        operator std::ostream&() { return *value; }
+        template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+    };
+}} // namespace pybind11::detail
diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index 34fe9b179..dc1c3fcc3 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -7,6 +7,7 @@ set( sources
       Grid3D.cpp
       Mesh.cpp
       MeshReaders.cpp
+      MeshWriters.cpp
       Object.cpp
       SparseMatrix.cpp
       String.cpp
diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp
new file mode 100644
index 000000000..17c3c7492
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshWriters.cpp
@@ -0,0 +1,88 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "MeshWriters.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/MeshReader.h>
+
+#include <TNL/Meshes/Writers/VTKWriter.h>
+#include <TNL/Meshes/Writers/VTUWriter.h>
+
+template< typename Writer, TNL::Meshes::VTK::FileFormat default_format >
+void export_MeshWriter( py::module & m, const char* name )
+{
+    // We cannot use MeshReader::VariantVector for Python bindings, because its variants are
+    // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t,
+    // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map
+    // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given
+    // Python value fits into the C++ type when selecting the alternative for a scalar type, and
+    // for containers like std::vector it merely selects the first possible type. For reference, see
+    // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161
+    using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >;
+
+    // Binding to Writer directly is not possible, because the writer has a std::ostream attribute
+    // which would reference the streambuf created by the type caster from the Python file-like object.
+    // However, the streambuf would be destroyed as soon as the writer is constructed and control
+    // returned to Python, so the following invocations would use an invalid object and segfault.
+    // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own
+    // ostream attribute and is initialized by a py::object to avoid type casting.
+    using PythonWriter = PyWriter< Writer, default_format >;
+    py::class_< PythonWriter >( m, name )
+        .def(py::init<py::object, TNL::Meshes::VTK::FileFormat>(), py::keep_alive<1, 2>(),
+              py::arg("stream"), py::pos_only(), py::arg("format") = default_format)
+        .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1)
+        .def("writeVertices", &Writer::template writeEntities< 0 >)
+        .def("writeCells", &Writer::template writeEntities<>)
+        // the arrays are taken as the local VariantVector defined above (not MeshReader::VariantVector)
+        .def("writePointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writePointData( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writeCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writeCellData( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writeDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writeDataArray( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+    ;
+}
+
+void export_MeshWriters( py::module & m )
+{
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfEdges" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTriangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTetrahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" );
+}
diff --git a/src/Python/pytnl/tnl/MeshWriters.h b/src/Python/pytnl/tnl/MeshWriters.h
new file mode 100644
index 000000000..9dd7185ea
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshWriters.h
@@ -0,0 +1,22 @@
+#include "../iostream_caster.h"
+#include <TNL/Meshes/VTKTraits.h>
+
+// helper struct is needed to ensure correct initialization order in the PyWriter constructor
+struct PyOstreamHelper
+{
+   py::object obj;
+   pystreambuf::ostream str;
+
+   PyOstreamHelper( py::object src )
+      : obj(py::reinterpret_borrow<py::object>(src)),
+        str(obj)
+   {}
+};
+
+// Writer bound to a Python file-like object; the requested file format is forwarded to Writer.
+template< typename Writer, TNL::Meshes::VTK::FileFormat default_format >
+struct PyWriter : public PyOstreamHelper, public Writer
+{
+   PyWriter( py::object src, TNL::Meshes::VTK::FileFormat format = default_format )
+   : PyOstreamHelper(src), Writer(str, format) {}  // format used to be dropped here
+};
diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp
index 2b2a852fa..65e9c14e4 100644
--- a/src/Python/pytnl/tnl/tnl.cpp
+++ b/src/Python/pytnl/tnl/tnl.cpp
@@ -16,6 +16,7 @@ void export_Grid3D( py::module & m );
 void export_VTKTraits( py::module & m );
 void export_Meshes( py::module & m );
 void export_MeshReaders( py::module & m );
+void export_MeshWriters( py::module & m );
 void export_SparseMatrices( py::module & m );
 
 template< typename T >
@@ -47,6 +48,7 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m)
 
     export_Meshes(m);
     export_MeshReaders(m);
+    export_MeshWriters(m);
 
     export_SparseMatrices(m);
 }
diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h
index e942db324..788a54813 100644
--- a/src/Python/pytnl/tnl_conversions.h
+++ b/src/Python/pytnl/tnl_conversions.h
@@ -2,3 +2,4 @@
 #include "tnl_str_conversion.h"
 #include "tnl_tuple_conversion.h"
 #include "variant_caster.h"
+#include "iostream_caster.h"
-- 
GitLab


From 9123ea3ff10028ce2bc1ab030e1dd3a6b0df6947 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 6 Nov 2020 22:58:46 +0100
Subject: [PATCH 12/50] pytnl: added bindings for distributed mesh writers

---
 src/Python/pytnl/tnl_mpi/CMakeLists.txt       |  1 +
 .../pytnl/tnl_mpi/DistributedMeshWriters.cpp  | 94 +++++++++++++++++++
 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp          |  2 +
 3 files changed, 97 insertions(+)
 create mode 100644 src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp

diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
index ee5e9cc32..2aa8f73da 100644
--- a/src/Python/pytnl/tnl_mpi/CMakeLists.txt
+++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
@@ -4,6 +4,7 @@ set(PYBIND11_CPP_STANDARD -std=c++14)
 set( sources
       DistributedMesh.cpp
       DistributedMeshReaders.cpp
+      DistributedMeshWriters.cpp
       tnl_mpi.cpp
 )
 pybind11_add_module( pytnl_mpi ${sources} )
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
new file mode 100644
index 000000000..4d1d18bae
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
@@ -0,0 +1,94 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../tnl/MeshWriters.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/MeshReader.h>
+
+#include <TNL/Meshes/Writers/PVTUWriter.h>
+
+template< template<typename> class WriterTemplate, typename LocalMesh, TNL::Meshes::VTK::FileFormat default_format >
+void export_DistributedMeshWriter( py::module & m, const char* name )
+{
+    using Writer = WriterTemplate< LocalMesh >;
+    using Mesh = TNL::Meshes::DistributedMeshes::DistributedMesh< LocalMesh >;
+
+    // We cannot use MeshReader::VariantVector for Python bindings, because its variants are
+    // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t,
+    // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map
+    // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given
+    // Python value fits into the C++ type when selecting the alternative for a scalar type, and
+    // for containers like std::vector it merely selects the first possible type. For reference, see
+    // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161
+    using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >;
+
+    // Binding to Writer directly is not possible, because the writer has a std::ostream attribute
+    // which would reference the streambuf created by the type caster from the Python file-like object.
+    // However, the streambuf would be destroyed as soon as the writer is constructed and control
+    // returned to Python, so the following invocations would use an invalid object and segfault.
+    // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own
+    // ostream attribute and is initialized by a py::object to avoid type casting.
+    using PythonWriter = PyWriter< Writer, default_format >;
+    py::class_< PythonWriter >( m, name )
+        .def(py::init<py::object, TNL::Meshes::VTK::FileFormat>(), py::keep_alive<1, 2>(),
+              py::arg("stream"), py::pos_only(), py::arg("format") = default_format)
+        .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1)
+        .def("writeVertices", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities< 0 >),
+              py::arg("distributedMesh"))
+        .def("writeVertices", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities< 0 >),
+              py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0)
+        .def("writeCells", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities<>),
+              py::arg("distributedMesh"))
+        .def("writeCells", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities<>),
+              py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0)
+        // INCONSISTENCY: the C++ methods writePPointData, writePCellData, writePDataArray do not
+        // take the whole array as parameter, only the ValueType as a template parameter. Since
+        // this does not map nicely to Python, we pass the whole array just like in the
+        // VTKWriter and VTUWriter classes.
+        // we use the VariantVector alias defined above because we already have a caster for it
+        .def("writePPointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePPointData< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writePCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePCellData< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writePDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePDataArray< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        // NOTE: only the overload intended for sequential writing is exported, because we don't
+        // have type casters for Communicators::MpiCommunicator::CommunicationGroup
+        // (ideally, the communication group would be compatible with the mpi4py objects)
+        .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ),
+              py::arg("mainFileName"), py::arg("subdomainIndex"))
+    ;
+}
+
+void export_DistributedMeshWriters( py::module & m )
+{
+    constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed;
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges,        default_format >( m, "PVTUWriter_MeshOfEdges" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles,    default_format >( m, "PVTUWriter_MeshOfTriangles" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" );
+}
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
index de2359ac2..be7813959 100644
--- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -7,6 +7,7 @@
 // external functions
 void export_DistributedMeshes( py::module & m );
 void export_DistributedMeshReaders( py::module & m );
+void export_DistributedMeshWriters( py::module & m );
 
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 
@@ -32,6 +33,7 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m)
     // bindings for distributed data structures
     export_DistributedMeshes(m);
     export_DistributedMeshReaders(m);
+    export_DistributedMeshWriters(m);
 
     // bindings for functions
     using TNL::Meshes::DistributedMeshes::distributeSubentities;
-- 
GitLab


From 5a62beedbd1da99c2da0ecd41153d4ee74a034a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 7 Nov 2020 11:38:02 +0100
Subject: [PATCH 13/50] pystreambuf: fixed broken overflow() method and enabled
 exceptions for failbit

---
 src/3rdparty/cctbx/pystreambuf.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h
index 6e0c497e4..d2d67730a 100644
--- a/src/3rdparty/cctbx/pystreambuf.h
+++ b/src/3rdparty/cctbx/pystreambuf.h
@@ -188,9 +188,8 @@ class streambuf : public std::basic_streambuf<char>
       }
 
       if (!py_write.is_none()) {
-        // C-like string to make debugging easier
+        // add one extra byte for characters passed to the overflow() method
         write_buffer = new char[buffer_size + 1];
-        write_buffer[buffer_size] = '\0';
         setp(write_buffer, write_buffer + buffer_size);  // 27.5.2.4.5 (5)
         farthest_pptr = pptr();
       }
@@ -255,12 +254,13 @@ class streambuf : public std::basic_streambuf<char>
       }
       farthest_pptr = std::max(farthest_pptr, pptr());
       off_type n_written = (off_type)(farthest_pptr - pbase());
-      pybind11::bytes chunk(pbase(), n_written);
-      py_write(chunk);
       if (!traits_type::eq_int_type(c, traits_type::eof())) {
-        py_write(traits_type::to_char_type(c));
-        n_written++;
+        // add the overflown character to the end of the buffer
+        // (we have one extra byte just for that)
+        write_buffer[n_written++] = traits_type::to_char_type(c);
       }
+      pybind11::bytes chunk(pbase(), n_written);
+      py_write(chunk);
       if (n_written) {
         pos_of_write_buffer_end_in_py_file += n_written;
         setp(pbase(), epptr());
@@ -450,7 +450,7 @@ class streambuf : public std::basic_streambuf<char>
       public:
         istream(streambuf& buf) : std::istream(&buf)
         {
-          exceptions(std::ios_base::badbit);
+          exceptions(std::ios_base::badbit | std::ios_base::failbit);
         }
 
         ~istream() { if (this->good()) this->sync(); }
@@ -461,7 +461,7 @@ class streambuf : public std::basic_streambuf<char>
       public:
         ostream(streambuf& buf) : std::ostream(&buf)
         {
-          exceptions(std::ios_base::badbit);
+          exceptions(std::ios_base::badbit | std::ios_base::failbit);
         }
 
         ~ostream() { if (this->good()) this->flush(); }
-- 
GitLab


From afcc762a2f0f932051193d036b7d8a1a81122e1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Nov 2020 12:09:34 +0100
Subject: [PATCH 14/50] Mesh readers: changed methods readPointData and
 readCellData to be virtual

Also renamed PVTUReader's readLocalPointData and readLocalCellData to
fit into this hierarchy, hopefully it is clear that they return only
local data.
---
 src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp |  2 --
 src/TNL/Meshes/Readers/MeshReader.h                 | 12 ++++++++++++
 src/TNL/Meshes/Readers/PVTUReader.h                 |  8 ++++----
 src/TNL/Meshes/Readers/XMLVTK.h                     |  8 ++++----
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
index 7847e340b..e972eb65e 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
@@ -20,7 +20,5 @@ void export_DistributedMeshReaders( py::module & m )
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >)
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >)
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >)
-        .def("readLocalPointData", &PVTUReader::readLocalPointData)
-        .def("readLocalCellData", &PVTUReader::readLocalCellData)
     ;
 }
diff --git a/src/TNL/Meshes/Readers/MeshReader.h b/src/TNL/Meshes/Readers/MeshReader.h
index 88e2986ba..8bf8189ba 100644
--- a/src/TNL/Meshes/Readers/MeshReader.h
+++ b/src/TNL/Meshes/Readers/MeshReader.h
@@ -150,6 +150,18 @@ public:
          throw MeshReaderError( "VTKReader", "MeshBuilder failed" );
    }
 
+   virtual VariantVector
+   readPointData( std::string arrayName )
+   {
+      throw Exceptions::NotImplementedError( "readPointData is not implemented in the mesh reader for this specific file format." );
+   }
+
+   virtual VariantVector
+   readCellData( std::string arrayName )  // fallback that always throws; readers supporting cell data override it
+   {
+      throw Exceptions::NotImplementedError( "readCellData is not implemented in the mesh reader for this specific file format." );
+   }
+
    std::string
    getMeshType() const
    {
diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h
index 4bb8ba7eb..393ee1551 100644
--- a/src/TNL/Meshes/Readers/PVTUReader.h
+++ b/src/TNL/Meshes/Readers/PVTUReader.h
@@ -211,14 +211,14 @@ public:
       mesh.setCommunicationGroup( group );
    }
 
-   VariantVector
-   readLocalPointData( std::string arrayName )
+   virtual VariantVector
+   readPointData( std::string arrayName ) override
    {
       return localReader.readPointData( arrayName );
    }
 
-   VariantVector
-   readLocalCellData( std::string arrayName )
+   virtual VariantVector
+   readCellData( std::string arrayName ) override
    {
       return localReader.readCellData( arrayName );
    }
diff --git a/src/TNL/Meshes/Readers/XMLVTK.h b/src/TNL/Meshes/Readers/XMLVTK.h
index fb8e1eb40..af864e6e9 100644
--- a/src/TNL/Meshes/Readers/XMLVTK.h
+++ b/src/TNL/Meshes/Readers/XMLVTK.h
@@ -325,8 +325,8 @@ public:
 #endif
    }
 
-   VariantVector
-   readPointData( std::string arrayName )
+   virtual VariantVector
+   readPointData( std::string arrayName ) override
    {
 #ifdef HAVE_TINYXML2
       return readPointOrCellData( "PointData", arrayName );
@@ -335,8 +335,8 @@ public:
 #endif
    }
 
-   VariantVector
-   readCellData( std::string arrayName )
+   virtual VariantVector
+   readCellData( std::string arrayName ) override
    {
 #ifdef HAVE_TINYXML2
       return readPointOrCellData( "CellData", arrayName );
-- 
GitLab


From 7734ee3c56083c9fb8e3a48dd42cce32abd33165 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Nov 2020 12:32:39 +0100
Subject: [PATCH 15/50] Added function getMeshReader

It is useful especially when one wants to load mesh functions via the
readPointData or readCellData methods when the mesh was already loaded.
---
 src/TNL/Meshes/Readers/getMeshReader.h | 58 ++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 src/TNL/Meshes/Readers/getMeshReader.h

diff --git a/src/TNL/Meshes/Readers/getMeshReader.h b/src/TNL/Meshes/Readers/getMeshReader.h
new file mode 100644
index 000000000..2c2c18a8e
--- /dev/null
+++ b/src/TNL/Meshes/Readers/getMeshReader.h
@@ -0,0 +1,58 @@
+/***************************************************************************
+                          getMeshReader.h  -  description
+                             -------------------
+    begin                : Nov 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <experimental/filesystem>
+
+#include <TNL/Meshes/Readers/NetgenReader.h>
+#include <TNL/Meshes/Readers/VTKReader.h>
+#include <TNL/Meshes/Readers/VTUReader.h>
+#include <TNL/Meshes/Readers/PVTUReader.h>
+
+namespace TNL {
+namespace Meshes {
+namespace Readers {
+
+inline std::shared_ptr< Readers::MeshReader >  // inline: defined in a header, so it must be safe to include from several translation units
+getMeshReader( const std::string& fileName,
+               const std::string& fileFormat )
+{
+   namespace fs = std::experimental::filesystem;
+   std::string format = fileFormat;
+   if( format == "auto" ) {
+      format = fs::path(fileName).extension();
+      if( format.length() > 0 )
+         // remove dot from the extension
+         format = format.substr(1);
+   }
+
+   if( format == "ng" )
+      return std::make_shared< Readers::NetgenReader >( fileName );
+   else if( format == "vtk" )
+      return std::make_shared< Readers::VTKReader >( fileName );
+   else if( format == "vtu" )
+      return std::make_shared< Readers::VTUReader >( fileName );
+   else if( format == "pvtu" )
+      return std::make_shared< Readers::PVTUReader >( fileName );
+
+   if( fileFormat == "auto" )
+      std::cerr << "File '" << fileName << "' has unsupported format (based on the file extension): " << format << ".";
+   else
+      std::cerr << "Unsupported fileFormat parameter: " << fileFormat << ".";
+   std::cerr << " Supported formats are 'vtk', 'vtu', 'pvtu' and 'ng'." << std::endl;
+   return nullptr;  // no reader matches the format; the reason was printed to std::cerr above
+}
+
+} // namespace Readers
+} // namespace Meshes
+} // namespace TNL
-- 
GitLab


From 0f95798ee8f044a904607b0c6d876230a950a46a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Nov 2020 22:13:09 +0100
Subject: [PATCH 16/50] Added parameter --redirect-mpi-output-dir to
 MpiCommunicator

---
 src/TNL/Communicators/MpiCommunicator.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index dd119e813..18143cce0 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -73,6 +73,7 @@ class MpiCommunicator
       {
 #ifdef HAVE_MPI
          config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true );
+         config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." );
          config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false );
          config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 );
 #endif
@@ -85,8 +86,9 @@ class MpiCommunicator
          if(IsInitialized())//i.e. - isUsed
          {
             const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
+            const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
             if( redirect )
-               setupRedirection();
+               setupRedirection( outputDirectory );
 #ifdef HAVE_CUDA
             int size;
             MPI_Comm_size( MPI_COMM_WORLD, &size );
@@ -152,15 +154,15 @@ class MpiCommunicator
          (void) NullRequest;
       }
 
-      static void setupRedirection()
+      static void setupRedirection( std::string outputDirectory )
       {
 #ifdef HAVE_MPI
          if(isDistributed() )
          {
             if(GetRank(AllGroup)!=0)
             {
-               const std::string stdoutFile = std::string("./stdout_") + std::to_string(GetRank(AllGroup)) + ".txt";
-               const std::string stderrFile = std::string("./stderr_") + std::to_string(GetRank(AllGroup)) + ".txt";
+               const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank(AllGroup)) + ".txt";
+               const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank(AllGroup)) + ".txt";
                std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
                Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
             }
-- 
GitLab


From 0052f917e114ebc2f75afa1d23ad7b433f4f3a2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 10 Nov 2020 20:03:19 +0100
Subject: [PATCH 17/50] pytnl: updated bindings for Mesh, added missing methods

---
 src/Python/pytnl/tnl/EntityTypes.h  | 36 -----------------------------
 src/Python/pytnl/tnl/Grid.h         | 16 ++++++-------
 src/Python/pytnl/tnl/Mesh.h         | 36 ++++++++++++++++++-----------
 src/Python/pytnl/tnl/mesh_getters.h | 36 +++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 57 deletions(-)
 delete mode 100644 src/Python/pytnl/tnl/EntityTypes.h
 create mode 100644 src/Python/pytnl/tnl/mesh_getters.h

diff --git a/src/Python/pytnl/tnl/EntityTypes.h b/src/Python/pytnl/tnl/EntityTypes.h
deleted file mode 100644
index 1f10e2827..000000000
--- a/src/Python/pytnl/tnl/EntityTypes.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-
-enum class EntityTypes { Cell, Face, Vertex };
-
-inline void
-export_EntityTypes( py::module & m )
-{
-    // avoid duplicate conversion -> export only once
-    static bool exported = false;
-    if( ! exported ) {
-        // TODO: should be nested types instead
-        py::enum_< EntityTypes >( m, "EntityTypes" )
-            .value("Cell", EntityTypes::Cell)
-            .value("Face", EntityTypes::Face)
-            .value("Vertex", EntityTypes::Vertex)
-        ;
-        exported = true;
-    }
-}
-
-template< typename Mesh >
-typename Mesh::GlobalIndexType
-mesh_getEntitiesCount( const Mesh & self, const EntityTypes & entity )
-{
-    if( entity == EntityTypes::Cell )
-        return self.template getEntitiesCount< typename Mesh::Cell >();
-    else if( entity == EntityTypes::Face )
-        return self.template getEntitiesCount< typename Mesh::Face >();
-    else if( entity == EntityTypes::Vertex )
-        return self.template getEntitiesCount< typename Mesh::Vertex >();
-    else
-        throw py::value_error("The entity parameter must be either Cell, Face or Vertex.");
-}
diff --git a/src/Python/pytnl/tnl/Grid.h b/src/Python/pytnl/tnl/Grid.h
index 8cf28a8f5..2622bd5c9 100644
--- a/src/Python/pytnl/tnl/Grid.h
+++ b/src/Python/pytnl/tnl/Grid.h
@@ -5,7 +5,7 @@ namespace py = pybind11;
 
 #include "StaticVector.h"
 #include "Grid_getSpaceStepsProducts.h"
-#include "EntityTypes.h"
+#include "mesh_getters.h"
 
 #include <type_traits>
 
@@ -54,8 +54,6 @@ void export_Grid( py::module & m, const char* name )
 //    void (Grid::* _setDimensions1)(const IndexType) = &Grid::setDimensions;
     void (Grid::* _setDimensions2)(const typename Grid::CoordinatesType &) = &Grid::setDimensions;
 
-    export_EntityTypes(m);
-
     auto grid = py::class_<Grid, TNL::Object>( m, name )
         .def(py::init<>())
         .def_static("getMeshDimension", &Grid::getMeshDimension)
@@ -68,11 +66,13 @@ void export_Grid( py::module & m, const char* name )
         .def("setDomain", &Grid::setDomain)
         .def("getOrigin", &Grid::getOrigin, py::return_value_policy::reference_internal)
         .def("getProportions", &Grid::getProportions, py::return_value_policy::reference_internal)
-        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid >)
-        // TODO: if combined, the return type would depend on the runtime parameter (entity)
-        .def("getEntity_cell", &Grid::template getEntity<typename Grid::Cell>)
-        .def("getEntity_face", &Grid::template getEntity<typename Grid::Face>)
-        .def("getEntity_vertex", &Grid::template getEntity<typename Grid::Vertex>)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Cell >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Face >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Vertex >)
+        // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity)
+        .def("getCell", &Grid::template getEntity<typename Grid::Cell>)
+        .def("getFace", &Grid::template getEntity<typename Grid::Face>)
+        .def("getVertex", &Grid::template getEntity<typename Grid::Vertex>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Cell>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Face>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Vertex>)
diff --git a/src/Python/pytnl/tnl/Mesh.h b/src/Python/pytnl/tnl/Mesh.h
index 21fa015fc..3097f111f 100644
--- a/src/Python/pytnl/tnl/Mesh.h
+++ b/src/Python/pytnl/tnl/Mesh.h
@@ -5,7 +5,7 @@ namespace py = pybind11;
 
 #include "../typedefs.h"
 #include "StaticVector.h"
-#include "EntityTypes.h"
+#include "mesh_getters.h"
 
 #include <TNL/String.h>
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
@@ -82,8 +82,11 @@ template< typename MeshEntity, typename Scope >
 void export_MeshEntity( Scope & scope, const char* name )
 {
     auto entity = py::class_< MeshEntity >( scope, name )
+//        .def(py::init<>())
+//        .def(py::init<typename MeshEntity::MeshType, typename MeshEntity::GlobalIndexType>())
         .def_static("getEntityDimension", &MeshEntity::getEntityDimension)
         .def("getIndex", &MeshEntity::getIndex)
+        .def("getTag", &MeshEntity::getTag)
         // TODO
     ;
 
@@ -95,23 +98,24 @@ void export_MeshEntity( Scope & scope, const char* name )
 template< typename Mesh >
 void export_Mesh( py::module & m, const char* name )
 {
-    // there are two templates - const and non-const - take only the const
-    auto (Mesh::* getEntity_cell)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Cell>;
-    auto (Mesh::* getEntity_face)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Face>;
-    auto (Mesh::* getEntity_vertex)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Vertex>;
-
-    export_EntityTypes(m);
-
     auto mesh = py::class_< Mesh, TNL::Object >( m, name )
         .def(py::init<>())
         .def_static("getMeshDimension", &Mesh::getMeshDimension)
         .def_static("getSerializationType", &Mesh::getSerializationType)
         .def("getSerializationTypeVirtual", &Mesh::getSerializationTypeVirtual)
-        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh >)
-        // TODO: if combined, the return type would depend on the runtime parameter (entity)
-        .def("getEntity_cell", getEntity_cell)
-        .def("getEntity_face", getEntity_face)
-        .def("getEntity_vertex", getEntity_vertex)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Cell >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Face >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Vertex >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Cell >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Face >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Vertex >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Cell >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Face >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Vertex >)
+        // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity)
+        .def("getCell", &Mesh::template getEntity<typename Mesh::Cell>)
+        .def("getFace", &Mesh::template getEntity<typename Mesh::Face>)
+        .def("getVertex", &Mesh::template getEntity<typename Mesh::Vertex>)
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Cell& cell ){ return getEntityCenter( mesh, cell ); } )
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Face& face ){ return getEntityCenter( mesh, face ); } )
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ return getEntityCenter( mesh, vertex ); } )
@@ -124,6 +128,12 @@ void export_Mesh( py::module & m, const char* name )
                                        return mesh.template isBoundaryEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } )
         .def("isBoundaryEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){
                                         return mesh.template isBoundaryEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Cell& cell ){
+                                       return mesh.template isGhostEntity< Mesh::Cell::getEntityDimension() >( cell.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Face& face ){
+                                       return mesh.template isGhostEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){
+                                        return mesh.template isGhostEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } )
         // TODO: more?
     ;
 
diff --git a/src/Python/pytnl/tnl/mesh_getters.h b/src/Python/pytnl/tnl/mesh_getters.h
new file mode 100644
index 000000000..c5eddaa5e
--- /dev/null
+++ b/src/Python/pytnl/tnl/mesh_getters.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <type_traits>
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getEntitiesCount( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getEntitiesCount< EntityType::getEntityDimension() >();
+}
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getGhostEntitiesCount( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getGhostEntitiesCount< EntityType::getEntityDimension() >();
+}
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getGhostEntitiesOffset( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getGhostEntitiesOffset< EntityType::getEntityDimension() >();
+}
-- 
GitLab


From 40cd3071254dd505d3406423d6085348c4ccdfdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 14 Nov 2020 11:43:02 +0100
Subject: [PATCH 18/50] Refactored BiCGStab, added restarting

---
 src/TNL/Solvers/Linear/BICGStab.h      |   4 +
 src/TNL/Solvers/Linear/BICGStab_impl.h | 138 +++++++++++++------------
 2 files changed, 74 insertions(+), 68 deletions(-)

diff --git a/src/TNL/Solvers/Linear/BICGStab.h b/src/TNL/Solvers/Linear/BICGStab.h
index 2cede824a..474a45d02 100644
--- a/src/TNL/Solvers/Linear/BICGStab.h
+++ b/src/TNL/Solvers/Linear/BICGStab.h
@@ -37,6 +37,10 @@ public:
    bool solve( ConstVectorViewType b, VectorViewType x ) override;
 
 protected:
+   void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b );
+
+   void preconditioned_matvec( ConstVectorViewType src, VectorViewType dst );
+
    void setSize( const VectorViewType& x );
 
    bool exact_residue = false;
diff --git a/src/TNL/Solvers/Linear/BICGStab_impl.h b/src/TNL/Solvers/Linear/BICGStab_impl.h
index baa4b6363..ff3b42ed0 100644
--- a/src/TNL/Solvers/Linear/BICGStab_impl.h
+++ b/src/TNL/Solvers/Linear/BICGStab_impl.h
@@ -38,111 +38,80 @@ setup( const Config::ParameterContainer& parameters,
 }
 
 template< typename Matrix >
-bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x )
+bool
+BICGStab< Matrix >::
+solve( ConstVectorViewType b, VectorViewType x )
 {
    this->setSize( x );
 
-   RealType alpha, beta, omega, aux, rho, rho_old, b_norm;
+   RealType alpha, beta, omega, rho, rho_old, b_norm, r_ast_sqnorm;
 
+   // initialize the norm of the preconditioned right-hand-side
    if( this->preconditioner ) {
       this->preconditioner->solve( b, M_tmp );
       b_norm = lpNorm( M_tmp, 2.0 );
-
-      this->matrix->vectorProduct( x, M_tmp );
-      M_tmp = b - M_tmp;
-      this->preconditioner->solve( M_tmp, r );
    }
-   else {
+   else
       b_norm = lpNorm( b, 2.0 );
-      this->matrix->vectorProduct( x, r );
-      r = b - r;
-   }
+   if( b_norm == 0.0 )
+      b_norm = 1.0;
+
+   // r = M.solve(b - A * x);
+   compute_residue( r, x, b );
 
    p = r_ast = r;
    s.setValue( 0.0 );
-   rho = (r, r_ast);
+   r_ast_sqnorm = rho = (r, r_ast);
 
-   if( b_norm == 0.0 )
-       b_norm = 1.0;
+   const RealType eps2 = std::numeric_limits<RealType>::epsilon() * std::numeric_limits<RealType>::epsilon();
 
    this->resetIterations();
    this->setResidue( std::sqrt( rho ) / b_norm );
 
    while( this->nextIteration() )
    {
-      /****
-       * alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 )
-       */
-      if( this->preconditioner ) {
-         this->matrix->vectorProduct( p, M_tmp );
-         this->preconditioner->solve( M_tmp, Ap );
-      }
-      else {
-         this->matrix->vectorProduct( p, Ap );
-      }
-      aux = (Ap, r_ast);
-      alpha = rho / aux;
+      // alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 )
+      preconditioned_matvec( p, Ap );
+      alpha = rho / (Ap, r_ast);
 
-      /****
-       * s_j = r_j - alpha_j * A p_j
-       */
+      // s_j = r_j - alpha_j * A p_j
       s = r - alpha * Ap;
 
-      /****
-       * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
-       */
-      if( this->preconditioner ) {
-         this->matrix->vectorProduct( s, M_tmp );
-         this->preconditioner->solve( M_tmp, As );
-      }
-      else {
-         this->matrix->vectorProduct( s, As );
-      }
-      aux = lpNorm( As, 2.0 );
-      omega = (As, s) / (aux * aux);
+      // omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
+      preconditioned_matvec( s, As );
+      omega = (As, s) / (As, As);
 
-      /****
-       * x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
-       */
+      // x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
       x += alpha * p + omega * s;
 
-      /****
-       * r_{j+1} = s_j - omega_j * A s_j
-       */
+      // r_{j+1} = s_j - omega_j * A s_j
       r = s - omega * As;
 
-      /****
-       * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
-       */
+      // compute scalar product of the residual vectors
       rho_old = rho;
       rho = (r, r_ast);
+      if( abs(rho) < eps2 * r_ast_sqnorm ) {
+         // The new residual vector has become too orthogonal to the arbitrarily chosen direction r_ast.
+         // Let's restart with a new r0:
+         compute_residue( r, x, b );
+         r_ast = r;
+         r_ast_sqnorm = rho = (r, r_ast);
+      }
+
+      // beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
       beta = (rho / rho_old) * (alpha / omega);
 
-      /****
-       * p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j )
-       */
+      // p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j )
       p = r + beta * p - (beta * omega) * Ap;
 
       if( exact_residue ) {
-         /****
-          * Compute the exact preconditioned residue into the 's' vector.
-          */
-         if( this->preconditioner ) {
-            this->matrix->vectorProduct( x, M_tmp );
-            M_tmp = b - M_tmp;
-            this->preconditioner->solve( M_tmp, s );
-         }
-         else {
-            this->matrix->vectorProduct( x, s );
-            s = b - s;
-         }
+         // Compute the exact preconditioned residue into the 's' vector.
+         compute_residue( s, x, b );
          const RealType residue = lpNorm( s, 2.0 );
          this->setResidue( residue / b_norm );
       }
       else {
-         /****
-          * Use the "orthogonal residue vector" for stopping.
-          */
+         // Use the "orthogonal residue vector" for stopping.
          const RealType residue = lpNorm( r, 2.0 );
          this->setResidue( residue / b_norm );
       }
@@ -153,7 +122,40 @@ bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x )
 }
 
 template< typename Matrix >
-void BICGStab< Matrix > :: setSize( const VectorViewType& x )
+void
+BICGStab< Matrix >::
+compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b )
+{   // NOTE: the parameter 'r' shadows the member vector 'r'; solve() passes either r or s here
+   // r = M.solve(b - A * x) when a preconditioner is set (M_tmp is scratch), otherwise r = b - A * x
+   if( this->preconditioner ) {
+      this->matrix->vectorProduct( x, M_tmp );
+      M_tmp = b - M_tmp;
+      this->preconditioner->solve( M_tmp, r );
+   }
+   else {
+      this->matrix->vectorProduct( x, r );
+      r = b - r;
+   }
+}
+
+template< typename Matrix >
+void
+BICGStab< Matrix >::
+preconditioned_matvec( ConstVectorViewType src, VectorViewType dst )
+{   // dst = M.solve(A * src) when a preconditioner is set, plain dst = A * src otherwise
+   if( this->preconditioner ) {
+      this->matrix->vectorProduct( src, M_tmp );   // M_tmp receives the unpreconditioned product A * src
+      this->preconditioner->solve( M_tmp, dst );
+   }
+   else {
+      this->matrix->vectorProduct( src, dst );
+   }
+}
+
+template< typename Matrix >
+void
+BICGStab< Matrix >::
+setSize( const VectorViewType& x )
 {
    r.setLike( x );
    r_ast.setLike( x );
-- 
GitLab


From 4cf3545497fd6321351c8bfd4404d17fa69c79e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 16 Nov 2020 22:44:27 +0100
Subject: [PATCH 19/50] Refactoring DistributedArray - implementation via
 DistributedArrayView as a data member

---
 src/TNL/Containers/DistributedArray.h       |  4 +-
 src/TNL/Containers/DistributedArray.hpp     | 83 ++++++---------------
 src/TNL/Containers/DistributedArrayView.h   |  9 ++-
 src/TNL/Containers/DistributedArrayView.hpp | 18 +++++
 4 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 66dd8a8f0..31fc6d8a8 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -168,9 +168,7 @@ public:
    // TODO: serialization (save, load)
 
 protected:
-   LocalRangeType localRange;
-   IndexType globalSize = 0;
-   CommunicationGroup group = Communicator::NullGroup;
+   ViewType view;
    LocalArrayType localData;
 };
 
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index c146bbf9f..4910cbcd7 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -39,11 +39,9 @@ DistributedArray< Value, Device, Index, Communicator >::
 setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group )
 {
    TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" );
-   this->localRange = localRange;
-   this->globalSize = globalSize;
-   this->group = group;
    if( group != Communicator::NullGroup )
       localData.setSize( localRange.getSize() );
+   view.bind( localRange, globalSize, group, localData.getView() );
 }
 
 template< typename Value,
@@ -54,7 +52,7 @@ const Subrange< Index >&
 DistributedArray< Value, Device, Index, Communicator >::
 getLocalRange() const
 {
-   return localRange;
+   return view.getLocalRange();
 }
 
 template< typename Value,
@@ -65,7 +63,7 @@ typename Communicator::CommunicationGroup
 DistributedArray< Value, Device, Index, Communicator >::
 getCommunicationGroup() const
 {
-   return group;
+   return view.getCommunicationGroup();
 }
 
 template< typename Value,
@@ -99,18 +97,7 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 copyFromGlobal( ConstLocalViewType globalArray )
 {
-   TNL_ASSERT_EQ( getSize(), globalArray.getSize(),
-                  "given global array has different size than the distributed array" );
-
-   LocalViewType localView( localData );
-   const LocalRangeType localRange = getLocalRange();
-
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      localView[ i ] = globalArray[ localRange.getGlobalIndex( i ) ];
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel );
+   view.copyFromGlobal( globalArray );
 }
 
 
@@ -126,7 +113,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::ViewType
 DistributedArray< Value, Device, Index, Communicator >::
 getView()
 {
-   return ViewType( getLocalRange(), getSize(), getCommunicationGroup(), getLocalView() );
+   return view;
 }
 
 template< typename Value,
@@ -137,7 +124,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType
 DistributedArray< Value, Device, Index, Communicator >::
 getConstView() const
 {
-   return ConstViewType( getLocalRange(), getSize(), getCommunicationGroup(), getConstLocalView() );
+   return view.getConstView();
 }
 
 template< typename Value,
@@ -169,10 +156,8 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 setLike( const Array& array )
 {
-   localRange = array.getLocalRange();
-   globalSize = array.getSize();
-   group = array.getCommunicationGroup();
    localData.setLike( array.getConstLocalView() );
+   view.bind( array.getLocalRange(), array.getSize(), array.getCommunicationGroup(), localData.getView() );
 }
 
 template< typename Value,
@@ -183,9 +168,7 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 reset()
 {
-   localRange.reset();
-   globalSize = 0;
-   group = Communicator::NullGroup;
+   view.reset();
    localData.reset();
 }
 
@@ -197,7 +180,7 @@ bool
 DistributedArray< Value, Device, Index, Communicator >::
 empty() const
 {
-   return getSize() == 0;
+   return view.empty();
 }
 
 template< typename Value,
@@ -208,7 +191,7 @@ Index
 DistributedArray< Value, Device, Index, Communicator >::
 getSize() const
 {
-   return globalSize;
+   return view.getSize();
 }
 
 template< typename Value,
@@ -219,7 +202,7 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 setValue( ValueType value )
 {
-   localData.setValue( value );
+   view.setValue( value );
 }
 
 template< typename Value,
@@ -230,8 +213,7 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 setElement( IndexType i, ValueType value )
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   localData.setElement( li, value );
+   view.setElement( i, value );
 }
 
 template< typename Value,
@@ -242,8 +224,7 @@ Value
 DistributedArray< Value, Device, Index, Communicator >::
 getElement( IndexType i ) const
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData.getElement( li );
+   return view.getElement( i );
 }
 
 template< typename Value,
@@ -255,8 +236,7 @@ Value&
 DistributedArray< Value, Device, Index, Communicator >::
 operator[]( IndexType i )
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData[ li ];
+   return view[ i ];
 }
 
 template< typename Value,
@@ -268,8 +248,7 @@ const Value&
 DistributedArray< Value, Device, Index, Communicator >::
 operator[]( IndexType i ) const
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData[ li ];
+   return view[ i ];
 }
 
 template< typename Value,
@@ -281,7 +260,7 @@ DistributedArray< Value, Device, Index, Communicator >::
 operator=( const DistributedArray& array )
 {
    setLike( array );
-   localData = array.getConstLocalView();
+   view = array;
    return *this;
 }
 
@@ -295,7 +274,7 @@ DistributedArray< Value, Device, Index, Communicator >::
 operator=( const Array& array )
 {
    setLike( array );
-   localData = array.getConstLocalView();
+   view = array;
    return *this;
 }
 
@@ -308,17 +287,7 @@ bool
 DistributedArray< Value, Device, Index, Communicator >::
 operator==( const Array& array ) const
 {
-   // we can't run allreduce if the communication groups are different
-   if( group != array.getCommunicationGroup() )
-      return false;
-   const bool localResult =
-         localRange == array.getLocalRange() &&
-         globalSize == array.getSize() &&
-         localData == array.getConstLocalView();
-   bool result = true;
-   if( group != CommunicatorType::NullGroup )
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   return result;
+   return view == array;
 }
 
 template< typename Value,
@@ -330,7 +299,7 @@ bool
 DistributedArray< Value, Device, Index, Communicator >::
 operator!=( const Array& array ) const
 {
-   return ! (*this == array);
+   return view != array;
 }
 
 template< typename Value,
@@ -341,12 +310,7 @@ bool
 DistributedArray< Value, Device, Index, Communicator >::
 containsValue( ValueType value ) const
 {
-   bool result = false;
-   if( group != CommunicatorType::NullGroup ) {
-      const bool localResult = localData.containsValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group );
-   }
-   return result;
+   return view.containsValue( value );
 }
 
 template< typename Value,
@@ -357,12 +321,7 @@ bool
 DistributedArray< Value, Device, Index, Communicator >::
 containsOnlyValue( ValueType value ) const
 {
-   bool result = true;
-   if( group != CommunicatorType::NullGroup ) {
-      const bool localResult = localData.containsOnlyValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   }
-   return result;
+   return view.containsOnlyValue( value );
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index e17467bef..345d4c13c 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -74,9 +74,12 @@ public:
    __cuda_callable__
    DistributedArrayView( DistributedArrayView&& ) = default;
 
-   // method for rebinding (reinitialization)
-   // Note that you can also bind directly to Array and other types implicitly
-   // convertible to ArrayView.
+   // method for rebinding (reinitialization) to raw data
+   __cuda_callable__
+   void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData );
+
+   // Note that you can also bind directly to DistributedArray and other types implicitly
+   // convertible to DistributedArrayView.
    __cuda_callable__
    void bind( DistributedArrayView view );
 
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 0199229d4..3890764a2 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -31,6 +31,24 @@ DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communi
   localData( view.getConstLocalView() )
 {}
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+__cuda_callable__
+void
+DistributedArrayView< Value, Device, Index, Communicator >::
+bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
+{   // rebinds this view: stores the distribution metadata and rebinds the local data view
+   TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(),
+                  "The local array size does not match the local range of the distributed array." );
+
+   this->localRange = localRange;   // parameters shadow the members of the same name, hence the explicit this->
+   this->globalSize = globalSize;
+   this->group = group;
+   this->localData.bind( localData );
+}
+
 template< typename Value,
           typename Device,
           typename Index,
-- 
GitLab


From 87772050729688183efcbb1e04c903fa84dda6ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 18 Nov 2020 22:42:44 +0100
Subject: [PATCH 20/50] Cleaned up the implementation of DistributedVector

---
 src/TNL/Containers/DistributedVector.hpp | 76 ++++--------------------
 1 file changed, 13 insertions(+), 63 deletions(-)

diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index fa49591e8..b2c7de038 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -48,7 +48,7 @@ typename DistributedVector< Value, Device, Index, Communicator >::ViewType
 DistributedVector< Value, Device, Index, Communicator >::
 getView()
 {
-   return ViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getLocalView() );
+   return BaseType::getView();
 }
 
 template< typename Value,
@@ -59,7 +59,7 @@ typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType
 DistributedVector< Value, Device, Index, Communicator >::
 getConstView() const
 {
-   return ConstViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getConstLocalView() );
+   return BaseType::getConstView();
 }
 
 template< typename Value,
@@ -97,9 +97,7 @@ DistributedVector< Real, Device, Index, Communicator >::
 operator=( const Vector& vector )
 {
    this->setLike( vector );
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = vector.getConstLocalView();
-   }
+   getView() = vector;
    return *this;
 }
 
@@ -112,16 +110,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator+=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += vector.getConstLocalView();
-   }
+   getView() += vector;
    return *this;
 }
 
@@ -134,16 +123,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator-=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= vector.getConstLocalView();
-   }
+   getView() -= vector;
    return *this;
 }
 
@@ -156,16 +136,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator*=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= vector.getConstLocalView();
-   }
+   getView() *= vector;
    return *this;
 }
 
@@ -178,16 +149,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator/=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= vector.getConstLocalView();
-   }
+   getView() /= vector;
    return *this;
 }
 
@@ -200,9 +162,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = c;
-   }
+   getView() = c;
    return *this;
 }
 
@@ -215,9 +175,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator+=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += c;
-   }
+   getView() += c;
    return *this;
 }
 
@@ -230,9 +188,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator-=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= c;
-   }
+   getView() -= c;
    return *this;
 }
 
@@ -245,9 +201,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator*=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= c;
-   }
+   getView() *= c;
    return *this;
 }
 
@@ -260,9 +214,7 @@ DistributedVector< Real, Device, Index, Communicator >&
 DistributedVector< Real, Device, Index, Communicator >::
 operator/=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= c;
-   }
+   getView() /= c;
    return *this;
 }
 
@@ -275,9 +227,7 @@ void
 DistributedVector< Real, Device, Index, Communicator >::
 scan( IndexType begin, IndexType end )
 {
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
+   getView().template scan< Type >( begin, end );
 }
 
 } // namespace Containers
-- 
GitLab


From 977f08fd46a35aefa7ab5d8368e06ae273c28287 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 17 Nov 2020 10:49:47 +0100
Subject: [PATCH 21/50] Removed __cuda_callable__ from methods in
 DistributedArrayView, DistributedVectorView and DistributedMatrix

Distributed data structures are not supposed to be passed to device
kernels. They are operated on by the host, which uses the device only
for parallel processing of the local data structures.
---
 src/TNL/Containers/DistributedArrayView.h           | 13 +------------
 src/TNL/Containers/DistributedArrayView.hpp         |  5 -----
 src/TNL/Containers/DistributedVectorView.h          |  4 ----
 src/TNL/Containers/DistributedVectorView.hpp        |  2 --
 src/TNL/Matrices/DistributedMatrix.h                |  9 ---------
 src/TNL/Matrices/DistributedMatrix_impl.h           |  9 ---------
 src/TNL/Solvers/Linear/GMRES_impl.h                 |  8 ++++----
 .../Solvers/Linear/Preconditioners/Diagonal_impl.h  |  9 ++++++---
 8 files changed, 11 insertions(+), 48 deletions(-)

diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 345d4c13c..1936c8d58 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -48,7 +48,6 @@ public:
 
 
    // Initialization by raw data
-   __cuda_callable__
    DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
    : localRange(localRange), globalSize(globalSize), group(group), localData(localData)
    {
@@ -56,31 +55,23 @@ public:
                      "The local array size does not match the local range of the distributed array." );
    }
 
-   __cuda_callable__
    DistributedArrayView() = default;
 
-   // Copy-constructor does shallow copy, so views can be passed-by-value into
-   // CUDA kernels and they can be captured-by-value in __cuda_callable__
-   // lambda functions.
-   __cuda_callable__
+   // Copy-constructor does shallow copy.
    DistributedArrayView( const DistributedArrayView& ) = default;
 
    // "Templated copy-constructor" accepting any cv-qualification of Value
    template< typename Value_ >
-   __cuda_callable__
    DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& );
 
    // default move-constructor
-   __cuda_callable__
    DistributedArrayView( DistributedArrayView&& ) = default;
 
    // method for rebinding (reinitialization) to raw data
-   __cuda_callable__
    void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData );
 
    // Note that you can also bind directly to DistributedArray and other types implicitly
    // convertible to DistributedArrayView.
-   __cuda_callable__
    void bind( DistributedArrayView view );
 
    // binding to local array via raw pointer
@@ -91,13 +82,11 @@ public:
    /**
     * \brief Returns a modifiable view of the array view.
     */
-   __cuda_callable__
    ViewType getView();
 
    /**
     * \brief Returns a non-modifiable view of the array view.
     */
-   __cuda_callable__
    ConstViewType getConstView() const;
 
 
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 3890764a2..0a206054c 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -22,7 +22,6 @@ template< typename Value,
           typename Index,
           typename Communicator >
    template< typename Value_ >
-__cuda_callable__
 DistributedArrayView< Value, Device, Index, Communicator >::
 DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view )
 : localRange( view.getLocalRange() ),
@@ -35,7 +34,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 void
 DistributedArrayView< Value, Device, Index, Communicator >::
 bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
@@ -53,7 +51,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 void
 DistributedArrayView< Value, Device, Index, Communicator >::
 bind( DistributedArrayView view )
@@ -82,7 +79,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType
 DistributedArrayView< Value, Device, Index, Communicator >::
 getView()
@@ -94,7 +90,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType
 DistributedArrayView< Value, Device, Index, Communicator >::
 getConstView() const
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 157a64b94..cb46f59c3 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -58,12 +58,10 @@ public:
    // In C++14, default constructors cannot be inherited, although Clang
    // and GCC since version 7.0 inherit them.
    // https://stackoverflow.com/a/51854172
-   __cuda_callable__
    DistributedVectorView() = default;
 
    // initialization by base class is not a copy constructor so it has to be explicit
    template< typename Real_ >  // template catches both const and non-const qualified Element
-   __cuda_callable__
    DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view )
    : BaseType( view ) {}
 
@@ -74,13 +72,11 @@ public:
    /**
     * \brief Returns a modifiable view of the array view.
     */
-   __cuda_callable__
    ViewType getView();
 
    /**
     * \brief Returns a non-modifiable view of the array view.
     */
-   __cuda_callable__
    ConstViewType getConstView() const;
 
    /*
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 70f61979f..0e32343a4 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -44,7 +44,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType
 DistributedVectorView< Value, Device, Index, Communicator >::
 getView()
@@ -56,7 +55,6 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Communicator >
-__cuda_callable__
 typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType
 DistributedVectorView< Value, Device, Index, Communicator >::
 getConstView() const
diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index faa220da6..5731d11ca 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -72,16 +72,12 @@ public:
 
    void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup );
 
-   __cuda_callable__
    const LocalRangeType& getLocalRowRange() const;
 
-   __cuda_callable__
    CommunicationGroup getCommunicationGroup() const;
 
-   __cuda_callable__
    const Matrix& getLocalMatrix() const;
 
-   __cuda_callable__
    Matrix& getLocalMatrix();
 
 
@@ -99,10 +95,8 @@ public:
 
    void reset();
 
-   __cuda_callable__
    IndexType getRows() const;
 
-   __cuda_callable__
    IndexType getColumns() const;
 
    template< typename RowCapacitiesVector >
@@ -120,14 +114,11 @@ public:
    RealType getElement( IndexType row,
                         IndexType column ) const;
 
-   __cuda_callable__
    RealType getElementFast( IndexType row,
                             IndexType column ) const;
 
-   __cuda_callable__
    MatrixRow getRow( IndexType row );
 
-   __cuda_callable__
    ConstMatrixRow getRow( IndexType row ) const;
 
    // multiplication with a global vector
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index 806703ca6..40a675f1a 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -42,7 +42,6 @@ setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 const Containers::Subrange< typename Matrix::IndexType >&
 DistributedMatrix< Matrix, Communicator >::
 getLocalRowRange() const
@@ -52,7 +51,6 @@ getLocalRowRange() const
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename Communicator::CommunicationGroup
 DistributedMatrix< Matrix, Communicator >::
 getCommunicationGroup() const
@@ -62,7 +60,6 @@ getCommunicationGroup() const
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 const Matrix&
 DistributedMatrix< Matrix, Communicator >::
 getLocalMatrix() const
@@ -72,7 +69,6 @@ getLocalMatrix() const
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 Matrix&
 DistributedMatrix< Matrix, Communicator >::
 getLocalMatrix()
@@ -139,7 +135,6 @@ reset()
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
 getRows() const
@@ -149,7 +144,6 @@ getRows() const
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
 getColumns() const
@@ -224,7 +218,6 @@ getElement( IndexType row,
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename Matrix::RealType
 DistributedMatrix< Matrix, Communicator >::
 getElementFast( IndexType row,
@@ -236,7 +229,6 @@ getElementFast( IndexType row,
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename DistributedMatrix< Matrix, Communicator >::MatrixRow
 DistributedMatrix< Matrix, Communicator >::
 getRow( IndexType row )
@@ -247,7 +239,6 @@ getRow( IndexType row )
 
 template< typename Matrix,
           typename Communicator >
-__cuda_callable__
 typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow
 DistributedMatrix< Matrix, Communicator >::
 getRow( IndexType row ) const
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 02a122a5d..23b563940 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -477,20 +477,20 @@ hauseholder_generate( const int i,
                       ConstVectorViewType z )
 {
    // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization
+   ConstDeviceView z_local = Traits::getConstLocalView( z );
+   DeviceView y_i_local = Traits::getLocalView( y_i );
    if( localOffset == 0 ) {
       TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" );
       auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable
       {
          if( j < i )
-            y_i[ j ] = 0.0;
+            y_i_local[ j ] = 0.0;
          else
-            y_i[ j ] = z[ j ];
+            y_i_local[ j ] = z_local[ j ];
       };
       Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation );
    }
    else {
-      ConstDeviceView z_local = Traits::getConstLocalView( z );
-      DeviceView y_i_local = Traits::getLocalView( y_i );
       y_i_local = z_local;
    }
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 788fc228d..f30151548 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -71,12 +71,15 @@ update( const MatrixPointer& matrixPointer )
    diagonal.setSize( matrixPointer->getLocalMatrix().getRows() );
 
    LocalViewType diag_view( diagonal );
-   const MatrixType* kernel_matrix = &matrixPointer.template getData< DeviceType >();
+   // FIXME: SparseMatrix::getConstView is broken
+//   const auto matrix_view = matrixPointer->getLocalMatrix().getConstView();
+   const auto matrix_view = matrixPointer->getLocalMatrix().getView();
+   const auto row_range = matrixPointer->getLocalRowRange();
 
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
    {
-      const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i );
-      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi );
+      const IndexType gi = row_range.getGlobalIndex( i );
+      diag_view[ i ] = matrix_view.getElement( i, gi );
    };
 
    Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
-- 
GitLab


From 98fe52f6fe715c29aa381214501bf403dd5cbfa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 17 Nov 2020 14:27:35 +0100
Subject: [PATCH 22/50] Reordered methods in DistributedArrayView

---
 src/TNL/Containers/DistributedArrayView.h   |  40 ++++----
 src/TNL/Containers/DistributedArrayView.hpp | 108 ++++++++++----------
 2 files changed, 72 insertions(+), 76 deletions(-)

diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 1936c8d58..86395517d 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -79,27 +79,6 @@ public:
    template< typename Value_ >
    void bind( Value_* data, IndexType localSize );
 
-   /**
-    * \brief Returns a modifiable view of the array view.
-    */
-   ViewType getView();
-
-   /**
-    * \brief Returns a non-modifiable view of the array view.
-    */
-   ConstViewType getConstView() const;
-
-
-   // Copy-assignment does deep copy, just like regular array, but the sizes
-   // must match (i.e. copy-assignment cannot resize).
-   DistributedArrayView& operator=( const DistributedArrayView& view );
-
-   template< typename Array,
-             typename...,
-             typename = std::enable_if_t< HasSubscriptOperator<Array>::value > >
-   DistributedArrayView& operator=( const Array& array );
-
-
    const LocalRangeType& getLocalRange() const;
 
    CommunicationGroup getCommunicationGroup() const;
@@ -115,6 +94,16 @@ public:
     * Usual ArrayView methods follow below.
     */
 
+   /**
+    * \brief Returns a modifiable view of the array view.
+    */
+   ViewType getView();
+
+   /**
+    * \brief Returns a non-modifiable view of the array view.
+    */
+   ConstViewType getConstView() const;
+
    // Resets the array view to the empty state.
    void reset();
 
@@ -143,6 +132,15 @@ public:
    __cuda_callable__
    const ValueType& operator[]( IndexType i ) const;
 
+   // Copy-assignment does deep copy, just like regular array, but the sizes
+   // must match (i.e. copy-assignment cannot resize).
+   DistributedArrayView& operator=( const DistributedArrayView& view );
+
+   template< typename Array,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Array>::value > >
+   DistributedArrayView& operator=( const Array& array );
+
    // Comparison operators
    template< typename Array >
    bool operator==( const Array& array ) const;
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 0a206054c..81583541c 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -75,61 +75,6 @@ bind( Value_* data, IndexType localSize )
    localData.bind( data, localSize );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getView()
-{
-   return *this;
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getConstView() const
-{
-   return *this;
-}
-
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Communicator >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
-operator=( const DistributedArrayView& view )
-{
-   TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
-   localData = view.getConstLocalView();
-   return *this;
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Communicator >
-   template< typename Array, typename..., typename >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
-operator=( const Array& array )
-{
-   TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
-   localData = array.getConstLocalView();
-   return *this;
-}
-
-
 template< typename Value,
           typename Device,
           typename Index,
@@ -197,6 +142,28 @@ copyFromGlobal( ConstLocalViewType globalArray )
 }
 
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType
+DistributedArrayView< Value, Device, Index, Communicator >::
+getView()
+{
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType
+DistributedArrayView< Value, Device, Index, Communicator >::
+getConstView() const
+{
+   return *this;
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -296,6 +263,37 @@ operator[]( IndexType i ) const
    return localData[ li ];
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+DistributedArrayView< Value, Device, Index, Communicator >&
+DistributedArrayView< Value, Device, Index, Communicator >::
+operator=( const DistributedArrayView& view )
+{
+   TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
+   localData = view.getConstLocalView();
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+   template< typename Array, typename..., typename >
+DistributedArrayView< Value, Device, Index, Communicator >&
+DistributedArrayView< Value, Device, Index, Communicator >::
+operator=( const Array& array )
+{
+   TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
+   localData = array.getConstLocalView();
+   return *this;
+}
+
 template< typename Value,
           typename Device,
           typename Index,
-- 
GitLab


From 5184793e8b71e585ef191b10e54d9e54b551c4dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 17 Nov 2020 13:36:25 +0100
Subject: [PATCH 23/50] Added base class ByteArraySynchronizer

---
 src/TNL/Containers/ByteArraySynchronizer.h    | 32 +++++++++++++++
 .../DistributedMeshSynchronizer.h             | 39 +++++++++++++------
 2 files changed, 59 insertions(+), 12 deletions(-)
 create mode 100644 src/TNL/Containers/ByteArraySynchronizer.h

diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h
new file mode 100644
index 000000000..520820c02
--- /dev/null
+++ b/src/TNL/Containers/ByteArraySynchronizer.h
@@ -0,0 +1,32 @@
+/***************************************************************************
+                          ByteArraySynchronizer.h  -  description
+                             -------------------
+    begin                : November 17, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <TNL/Containers/ArrayView.h>
+
+namespace TNL {
+namespace Containers {
+
+template< typename Device, typename Index >
+class ByteArraySynchronizer
+{
+public:
+   using ByteArrayView = ArrayView< std::uint8_t, Device, Index >;
+
+   virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) = 0;
+
+   virtual ~ByteArraySynchronizer() = default;
+};
+
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 724510bf4..225d1a2df 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <TNL/Containers/ByteArraySynchronizer.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Matrices/DenseMatrix.h>
 
@@ -32,11 +33,15 @@ struct HasMeshType< T, typename Containers::Expressions::enable_if_type< typenam
 template< typename DistributedMesh,
           int EntityDimension = DistributedMesh::getMeshDimension() >
 class DistributedMeshSynchronizer
+: public Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >
 {
+   using Base = Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >;
+
 public:
    using DeviceType = typename DistributedMesh::DeviceType;
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
    using CommunicatorType = typename DistributedMesh::CommunicatorType;
+   using ByteArrayView = typename Base::ByteArrayView;
 
    DistributedMeshSynchronizer() = default;
 
@@ -182,10 +187,20 @@ public:
    template< typename Array >
    void synchronizeArray( Array& array, int valuesPerElement = 1 )
    {
-      TNL_ASSERT_EQ( array.getSize(), valuesPerElement * ghostOffsets[ ghostOffsets.getSize() - 1 ],
-                     "The array does not have the expected size." );
+      static_assert( std::is_same< typename Array::DeviceType, DeviceType >::value,
+                     "mismatched DeviceType of the array" );
       using ValueType = typename Array::ValueType;
 
+      ByteArrayView view;
+      view.bind( reinterpret_cast<std::uint8_t*>( array.getData() ), sizeof(ValueType) * array.getSize() );
+      synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement );
+   }
+
+   virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override
+   {
+      TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ],
+                     "The array does not have the expected size." );
+
       // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
       #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
@@ -196,7 +211,7 @@ public:
       const int nproc = CommunicatorType::GetSize( group );
 
       // allocate send buffers (setSize does nothing if the array size is already correct)
-      sendBuffers.setSize( valuesPerElement * ghostNeighborOffsets[ nproc ] * sizeof(ValueType) );
+      sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] );
 
       // buffer for asynchronous communication requests
       std::vector< typename CommunicatorType::Request > requests;
@@ -205,20 +220,20 @@ public:
       for( int j = 0; j < nproc; j++ ) {
          if( ghostEntitiesCounts( rank, j ) > 0 ) {
             requests.push_back( CommunicatorType::IRecv(
-                     array.getData() + valuesPerElement * ghostOffsets[ j ],
-                     valuesPerElement * ghostEntitiesCounts( rank, j ),
+                     array.getData() + bytesPerValue * ghostOffsets[ j ],
+                     bytesPerValue * ghostEntitiesCounts( rank, j ),
                      j, 0, group ) );
          }
       }
 
-      Containers::ArrayView< ValueType, DeviceType, GlobalIndexType > sendBuffersView;
-      sendBuffersView.bind( reinterpret_cast<ValueType*>( sendBuffers.getData() ), valuesPerElement * ghostNeighborOffsets[ nproc ] );
+      ByteArrayView sendBuffersView;
+      sendBuffersView.bind( sendBuffers.getData(), bytesPerValue * ghostNeighborOffsets[ nproc ] );
       const auto ghostNeighborsView = ghostNeighbors.getConstView();
       const auto arrayView = array.getConstView();
-      auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, valuesPerElement] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable
+      auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, bytesPerValue] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable
       {
-         for( int i = 0; i < valuesPerElement; i++ )
-            sendBuffersView[ i + valuesPerElement * (offset + k) ] = arrayView[ i + valuesPerElement * ghostNeighborsView[ offset + k ] ];
+         for( int i = 0; i < bytesPerValue; i++ )
+            sendBuffersView[ i + bytesPerValue * (offset + k) ] = arrayView[ i + bytesPerValue * ghostNeighborsView[ offset + k ] ];
       };
 
       for( int i = 0; i < nproc; i++ ) {
@@ -229,8 +244,8 @@ public:
 
             // issue async send operation
             requests.push_back( CommunicatorType::ISend(
-                     sendBuffersView.getData() + valuesPerElement * ghostNeighborOffsets[ i ],
-                     valuesPerElement * ghostEntitiesCounts( i, rank ),
+                     sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ],
+                     bytesPerValue * ghostEntitiesCounts( i, rank ),
                      i, 0, group ) );
          }
       }
-- 
GitLab


From f0b42e43af8c031df53673e70c4085a3cbf91ff0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 18 Nov 2020 22:48:37 +0100
Subject: [PATCH 24/50] Added support for ghost ranges to DistributedArray and
 DistributedVector and their views

---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |   4 +-
 .../tnl-benchmark-linear-solvers.h            |   6 +-
 src/TNL/Containers/DistributedArray.h         |  60 ++++---
 src/TNL/Containers/DistributedArray.hpp       | 120 ++++++++++++--
 src/TNL/Containers/DistributedArrayView.h     |  36 ++++-
 src/TNL/Containers/DistributedArrayView.hpp   | 146 ++++++++++++++++--
 src/TNL/Containers/DistributedVector.h        |  19 ++-
 src/TNL/Containers/DistributedVector.hpp      |  22 +++
 src/TNL/Containers/DistributedVectorView.h    |  18 +++
 src/TNL/Containers/DistributedVectorView.hpp  |  63 +++++++-
 .../Expressions/DistributedComparison.h       |   6 +
 .../DistributedExpressionTemplates.h          | 146 +++++++++++++++++-
 src/TNL/Containers/Partitioner.h              |  66 +++++++-
 src/TNL/Matrices/DistributedMatrix_impl.h     |   2 +-
 .../Containers/DistributedArrayTest.h         |  80 ++++++++--
 .../Containers/DistributedVectorTest.h        |  14 +-
 .../Containers/VectorBinaryOperationsTest.h   |  24 ++-
 .../Containers/VectorHelperFunctions.h        |  14 +-
 .../Containers/VectorUnaryOperationsTest.h    |  35 ++++-
 .../Containers/VectorVerticalOperationsTest.h |  14 +-
 .../Matrices/DistributedMatrixTest.h          |   8 +-
 21 files changed, 792 insertions(+), 111 deletions(-)

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index b791b0100..74a3205d3 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -228,10 +228,10 @@ struct SpmvBenchmark
       const auto group = CommunicatorType::AllGroup;
       const auto localRange = Partitioner::splitRange( matrix.getRows(), group );
       DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group );
-      DistributedVector distributedVector( localRange, matrix.getRows(), group );
+      DistributedVector distributedVector( localRange, 0, matrix.getRows(), group );
 
       // copy the row lengths from the global matrix to the distributed matrix
-      DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group );
+      DistributedRowLengths distributedRowLengths( localRange, 0, matrix.getRows(), group );
       for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
          distributedRowLengths[ gi ] = matrix.getRowCapacity( gi );
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index cadb5a046..06ba2bc94 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -435,11 +435,11 @@ struct LinearSolversBenchmark
       const auto group = CommunicatorType::AllGroup;
       const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group );
       SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group );
-      DistributedVector dist_x0( localRange, matrixPointer->getRows(), group );
-      DistributedVector dist_b( localRange, matrixPointer->getRows(), group );
+      DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group );
+      DistributedVector dist_b( localRange, 0, matrixPointer->getRows(), group );
 
       // copy the row capacities from the global matrix to the distributed matrix
-      DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group );
+      DistributedRowLengths distributedRowLengths( localRange, 0, matrixPointer->getRows(), group );
       for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
          const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
          distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi );
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 31fc6d8a8..c1571bc9e 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -37,6 +37,7 @@ public:
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
    using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
    using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using SynchronizerType = typename ViewType::SynchronizerType;
 
    /**
     * \brief A template which allows to quickly obtain a \ref DistributedArray type with changed template parameters.
@@ -50,46 +51,54 @@ public:
 
    DistributedArray() = default;
 
-   DistributedArray( const DistributedArray& ) = default;
+   // Copy-constructor does deep copy.
+   DistributedArray( const DistributedArray& );
 
-   DistributedArray( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
 
-   void setDistribution( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
 
    const LocalRangeType& getLocalRange() const;
 
+   IndexType getGhosts() const;
+
    CommunicationGroup getCommunicationGroup() const;
 
    /**
     * \brief Returns a modifiable view of the local part of the array.
-    *
-    * If \e begin or \e end is set to a non-zero value, a view for the
-    * sub-interval `[begin, end)` is returned. Otherwise a view for whole
-    * local part of the array view is returned.
-    *
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
     */
    LocalViewType getLocalView();
 
    /**
     * \brief Returns a non-modifiable view of the local part of the array.
-    *
-    * If \e begin or \e end is set to a non-zero value, a view for the
-    * sub-interval `[begin, end)` is returned. Otherwise a view for whole
-    * local part of the array view is returned.
-    *
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
     */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the array,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the array,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    void copyFromGlobal( ConstLocalViewType globalArray );
 
+   // synchronizer stuff
+   void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 );
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const;
+
+   int getValuesPerElement() const;
+
+   void startSynchronization();
+
+   void waitForSynchronization() const;
+
 
    // Usual Array methods follow below.
 
@@ -170,6 +179,15 @@ public:
 protected:
    ViewType view;
    LocalArrayType localData;
+
+private:
+   template< typename Array, std::enable_if_t< std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true >
+   static void setSynchronizerHelper( ViewType& view, const Array& array )
+   {
+      view.setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
+   }
+   template< typename Array, std::enable_if_t< ! std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true >
+   static void setSynchronizerHelper( ViewType& view, const Array& array ) {}
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index 4910cbcd7..c23d0a7e4 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -25,9 +25,20 @@ template< typename Value,
           typename Index,
           typename Communicator >
 DistributedArray< Value, Device, Index, Communicator >::
-DistributedArray( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group )
+DistributedArray( const DistributedArray& array )
 {
-   setDistribution( localRange, globalSize, group );
+   setLike( array );
+   localData = array.getConstLocalViewWithGhosts();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group )
+{
+   setDistribution( localRange, ghosts, globalSize, group );
 }
 
 template< typename Value,
@@ -36,12 +47,12 @@ template< typename Value,
           typename Communicator >
 void
 DistributedArray< Value, Device, Index, Communicator >::
-setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group )
+setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group )
 {
    TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" );
    if( group != Communicator::NullGroup )
-      localData.setSize( localRange.getSize() );
-   view.bind( localRange, globalSize, group, localData.getView() );
+      localData.setSize( localRange.getSize() + ghosts );
+   view.bind( localRange, ghosts, globalSize, group, localData.getView() );
 }
 
 template< typename Value,
@@ -55,6 +66,17 @@ getLocalRange() const
    return view.getLocalRange();
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+Index
+DistributedArray< Value, Device, Index, Communicator >::
+getGhosts() const
+{
+   return view.getGhosts();
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -74,7 +96,7 @@ typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType
 DistributedArray< Value, Device, Index, Communicator >::
 getLocalView()
 {
-   return localData.getView();
+   return view.getLocalView();
 }
 
 template< typename Value,
@@ -85,7 +107,29 @@ typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewT
 DistributedArray< Value, Device, Index, Communicator >::
 getConstLocalView() const
 {
-   return localData.getConstView();
+   return view.getConstLocalView();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType
+DistributedArray< Value, Device, Index, Communicator >::
+getLocalViewWithGhosts()
+{
+   return view.getLocalViewWithGhosts();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType
+DistributedArray< Value, Device, Index, Communicator >::
+getConstLocalViewWithGhosts() const
+{
+   return view.getConstLocalViewWithGhosts();
 }
 
 
@@ -100,6 +144,61 @@ copyFromGlobal( ConstLocalViewType globalArray )
    view.copyFromGlobal( globalArray );
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArray< Value, Device, Index, Communicator >::
+setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
+{
+   view.setSynchronizer( synchronizer, valuesPerElement );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType >
+DistributedArray< Value, Device, Index, Communicator >::
+getSynchronizer() const
+{
+   return view.getSynchronizer();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+int
+DistributedArray< Value, Device, Index, Communicator >::
+getValuesPerElement() const
+{
+   return view.getValuesPerElement();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArray< Value, Device, Index, Communicator >::
+startSynchronization()
+{
+   view.startSynchronization();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArray< Value, Device, Index, Communicator >::
+waitForSynchronization() const
+{
+   view.waitForSynchronization();
+}
+
 
 /*
  * Usual Array methods follow below.
@@ -156,8 +255,11 @@ void
 DistributedArray< Value, Device, Index, Communicator >::
 setLike( const Array& array )
 {
-   localData.setLike( array.getConstLocalView() );
-   view.bind( array.getLocalRange(), array.getSize(), array.getCommunicationGroup(), localData.getView() );
+   localData.setLike( array.getConstLocalViewWithGhosts() );
+   view.bind( array.getLocalRange(), array.getGhosts(), array.getSize(), array.getCommunicationGroup(), localData.getView() );
+   // set, but do not unset, the synchronizer
+   if( array.getSynchronizer() )
+      setSynchronizerHelper( view, array );
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 86395517d..bf63f8cc6 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -12,9 +12,12 @@
 
 #pragma once
 
+#include <memory>
+
 #include <TNL/Containers/ArrayView.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
+#include <TNL/Containers/ByteArraySynchronizer.h>
 
 namespace TNL {
 namespace Containers {
@@ -36,6 +39,7 @@ public:
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
    using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
    using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref DistributedArrayView type with changed template parameters.
@@ -48,11 +52,12 @@ public:
 
 
    // Initialization by raw data
-   DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
-   : localRange(localRange), globalSize(globalSize), group(group), localData(localData)
+   DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
+   : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData)
    {
-      TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(),
+      TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
                      "The local array size does not match the local range of the distributed array." );
+      TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." );
    }
 
    DistributedArrayView() = default;
@@ -68,27 +73,44 @@ public:
    DistributedArrayView( DistributedArrayView&& ) = default;
 
    // method for rebinding (reinitialization) to raw data
-   void bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData );
+   void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData );
 
    // Note that you can also bind directly to DistributedArray and other types implicitly
    // convertible to DistributedArrayView.
    void bind( DistributedArrayView view );
 
    // binding to local array via raw pointer
-   // (local range, global size and communication group are preserved)
+   // (local range, ghosts, global size and communication group are preserved)
    template< typename Value_ >
    void bind( Value_* data, IndexType localSize );
 
    const LocalRangeType& getLocalRange() const;
 
+   IndexType getGhosts() const;
+
    CommunicationGroup getCommunicationGroup() const;
 
    LocalViewType getLocalView();
 
    ConstLocalViewType getConstLocalView() const;
 
+   LocalViewType getLocalViewWithGhosts();
+
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    void copyFromGlobal( ConstLocalViewType globalArray );
 
+   // synchronizer stuff
+   void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 );
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const;
+
+   int getValuesPerElement() const;
+
+   void startSynchronization();
+
+   void waitForSynchronization() const;
+
 
    /*
     * Usual ArrayView methods follow below.
@@ -156,9 +178,13 @@ public:
 
 protected:
    LocalRangeType localRange;
+   IndexType ghosts = 0;
    IndexType globalSize = 0;
    CommunicationGroup group = Communicator::NullGroup;
    LocalViewType localData;
+
+   std::shared_ptr< SynchronizerType > synchronizer = nullptr;
+   int valuesPerElement = 1;
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 81583541c..cb95427fc 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -25,9 +25,12 @@ template< typename Value,
 DistributedArrayView< Value, Device, Index, Communicator >::
 DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view )
 : localRange( view.getLocalRange() ),
+  ghosts( view.getGhosts() ),
   globalSize( view.getSize() ),
   group( view.getCommunicationGroup() ),
-  localData( view.getConstLocalView() )
+  localData( view.getConstLocalViewWithGhosts() ),
+  synchronizer( view.getSynchronizer() ),
+  valuesPerElement( view.getValuesPerElement() )
 {}
 
 template< typename Value,
@@ -36,12 +39,14 @@ template< typename Value,
           typename Communicator >
 void
 DistributedArrayView< Value, Device, Index, Communicator >::
-bind( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
+bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
 {
-   TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(),
+   TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
                   "The local array size does not match the local range of the distributed array." );
+   TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." );
 
    this->localRange = localRange;
+   this->ghosts = ghosts;
    this->globalSize = globalSize;
    this->group = group;
    this->localData.bind( localData );
@@ -56,9 +61,13 @@ DistributedArrayView< Value, Device, Index, Communicator >::
 bind( DistributedArrayView view )
 {
    localRange = view.getLocalRange();
+   ghosts = view.getGhosts();
    globalSize = view.getSize();
    group = view.getCommunicationGroup();
-   localData.bind( view.getLocalView() );
+   localData.bind( view.getLocalViewWithGhosts() );
+   // set, but do not unset, the synchronizer
+   if( view.getSynchronizer() )
+      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
 }
 
 template< typename Value,
@@ -70,7 +79,7 @@ void
 DistributedArrayView< Value, Device, Index, Communicator >::
 bind( Value_* data, IndexType localSize )
 {
-   TNL_ASSERT_EQ( localSize, localRange.getSize(),
+   TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts,
                   "The local array size does not match the local range of the distributed array." );
    localData.bind( data, localSize );
 }
@@ -86,6 +95,17 @@ getLocalRange() const
    return localRange;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+Index
+DistributedArrayView< Value, Device, Index, Communicator >::
+getGhosts() const
+{
+   return ghosts;
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -105,7 +125,7 @@ typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewTy
 DistributedArrayView< Value, Device, Index, Communicator >::
 getLocalView()
 {
-   return localData;
+   return LocalViewType( localData.getData(), localRange.getSize() );
 }
 
 template< typename Value,
@@ -115,6 +135,28 @@ template< typename Value,
 typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType
 DistributedArrayView< Value, Device, Index, Communicator >::
 getConstLocalView() const
+{
+   return ConstLocalViewType( localData.getData(), localRange.getSize() );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType
+DistributedArrayView< Value, Device, Index, Communicator >::
+getLocalViewWithGhosts()
+{
+   return localData;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType
+DistributedArrayView< Value, Device, Index, Communicator >::
+getConstLocalViewWithGhosts() const
 {
    return localData;
 }
@@ -130,7 +172,7 @@ copyFromGlobal( ConstLocalViewType globalArray )
    TNL_ASSERT_EQ( getSize(), globalArray.getSize(),
                   "given global array has different size than the distributed array view" );
 
-   LocalViewType localView( localData );
+   LocalViewType localView = getLocalView();
    const LocalRangeType localRange = getLocalRange();
 
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
@@ -139,6 +181,78 @@ copyFromGlobal( ConstLocalViewType globalArray )
    };
 
    Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel );
+   startSynchronization();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArrayView< Value, Device, Index, Communicator >::
+setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
+{
+   this->synchronizer = synchronizer;
+   this->valuesPerElement = valuesPerElement;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType >
+DistributedArrayView< Value, Device, Index, Communicator >::
+getSynchronizer() const
+{
+   return synchronizer;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+int
+DistributedArrayView< Value, Device, Index, Communicator >::
+getValuesPerElement() const
+{
+   return valuesPerElement;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArrayView< Value, Device, Index, Communicator >::
+startSynchronization()
+{
+   if( ghosts == 0 )
+      return;
+   // TODO: assert does not play very nice with automatic synchronizations from operations like
+   //       assignment of scalars
+   // (Maybe we should just drop all automatic syncs? But that's not nice for high-level codes
+   // like linear solvers...)
+   TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" );
+
+   // wait for any previous synchronization (in case the array was inconsistently modified
+   // while a synchronization was in progress)
+   waitForSynchronization();
+
+   typename SynchronizerType::ByteArrayView bytes;
+   bytes.bind( reinterpret_cast<std::uint8_t*>( localData.getData() ), sizeof(ValueType) * localData.getSize() );
+   // TODO: implement the async stuff
+   synchronizer->synchronizeByteArray( bytes, sizeof(ValueType) * valuesPerElement );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+void
+DistributedArrayView< Value, Device, Index, Communicator >::
+waitForSynchronization() const
+{
+   // TODO: implement the async stuff
 }
 
 
@@ -173,6 +287,7 @@ DistributedArrayView< Value, Device, Index, Communicator >::
 reset()
 {
    localRange.reset();
+   ghosts = 0;
    globalSize = 0;
    group = Communicator::NullGroup;
    localData.reset();
@@ -211,6 +326,7 @@ DistributedArrayView< Value, Device, Index, Communicator >::
 setValue( ValueType value )
 {
    localData.setValue( value );
+   startSynchronization();
 }
 
 template< typename Value,
@@ -273,8 +389,12 @@ operator=( const DistributedArrayView& view )
 {
    TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
-   localData = view.getConstLocalView();
+   localData = view.getConstLocalViewWithGhosts();
+   // set, but do not unset, the synchronizer
+   if( view.getSynchronizer() )
+      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
    return *this;
 }
 
@@ -289,8 +409,12 @@ operator=( const Array& array )
 {
    TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
-   localData = array.getConstLocalView();
+   localData = array.getConstLocalViewWithGhosts();
+   // set, but do not unset, the synchronizer
+   if( array.getSynchronizer() )
+      setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
    return *this;
 }
 
@@ -308,8 +432,10 @@ operator==( const Array& array ) const
       return false;
    const bool localResult =
          localRange == array.getLocalRange() &&
+         ghosts == array.getGhosts() &&
          globalSize == array.getSize() &&
-         localData == array.getConstLocalView();
+         // compare without ghosts
+         getConstLocalView() == array.getConstLocalView();
    bool result = true;
    if( group != CommunicatorType::NullGroup )
       CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index 5d5f8303f..32dc80125 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -75,11 +75,28 @@ public:
     */
    DistributedVector& operator=( DistributedVector&& ) = default;
 
-   // we return only the view so that the user cannot resize it
+   /**
+    * \brief Returns a modifiable view of the local part of the vector.
+    */
    LocalViewType getLocalView();
 
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector.
+    */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    /**
     * \brief Returns a modifiable view of the vector.
     */
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index b2c7de038..cbbc763ec 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -40,6 +40,28 @@ getConstLocalView() const
    return BaseType::getConstLocalView();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType
+DistributedVector< Real, Device, Index, Communicator >::
+getLocalViewWithGhosts()
+{
+   return BaseType::getLocalViewWithGhosts();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType
+DistributedVector< Real, Device, Index, Communicator >::
+getConstLocalViewWithGhosts() const
+{
+   return BaseType::getConstLocalViewWithGhosts();
+}
+
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index cb46f59c3..6be52d9db 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -65,10 +65,28 @@ public:
    DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view )
    : BaseType( view ) {}
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector.
+    */
    LocalViewType getLocalView();
 
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector.
+    */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    /**
     * \brief Returns a modifiable view of the array view.
     */
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 0e32343a4..f1a6fb1e5 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -40,6 +40,28 @@ getConstLocalView() const
    return BaseType::getConstLocalView();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType
+DistributedVectorView< Real, Device, Index, Communicator >::
+getLocalViewWithGhosts()
+{
+   return BaseType::getLocalViewWithGhosts();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Communicator >
+typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType
+DistributedVectorView< Real, Device, Index, Communicator >::
+getConstLocalViewWithGhosts() const
+{
+   return BaseType::getConstLocalViewWithGhosts();
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -80,11 +102,16 @@ operator=( const Vector& vector )
                   "The sizes of the array views must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "The communication groups of the array views must be equal." );
 
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = vector.getConstLocalView();
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() = vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
@@ -102,11 +129,16 @@ operator+=( const Vector& vector )
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += vector.getConstLocalView();
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() += vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
@@ -124,11 +156,16 @@ operator-=( const Vector& vector )
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= vector.getConstLocalView();
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() -= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
@@ -146,11 +183,16 @@ operator*=( const Vector& vector )
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= vector.getConstLocalView();
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() *= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
@@ -168,11 +210,16 @@ operator/=( const Vector& vector )
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= vector.getConstLocalView();
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() /= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
@@ -188,6 +235,7 @@ operator=( Scalar c )
 {
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
       getLocalView() = c;
+      this->startSynchronization();
    }
    return *this;
 }
@@ -203,6 +251,7 @@ operator+=( Scalar c )
 {
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
       getLocalView() += c;
+      this->startSynchronization();
    }
    return *this;
 }
@@ -218,6 +267,7 @@ operator-=( Scalar c )
 {
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
       getLocalView() -= c;
+      this->startSynchronization();
    }
    return *this;
 }
@@ -233,6 +283,7 @@ operator*=( Scalar c )
 {
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
       getLocalView() *= c;
+      this->startSynchronization();
    }
    return *this;
 }
@@ -248,6 +299,7 @@ operator/=( Scalar c )
 {
    if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
       getLocalView() /= c;
+      this->startSynchronization();
    }
    return *this;
 }
@@ -264,6 +316,7 @@ scan( IndexType begin, IndexType end )
    if( end == 0 )
       end = this->getSize();
    Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
+   this->startSynchronization();
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h
index 4cecc92bb..1cef0873d 100644
--- a/src/TNL/Containers/Expressions/DistributedComparison.h
+++ b/src/TNL/Containers/Expressions/DistributedComparison.h
@@ -38,7 +38,9 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult =
             a.getLocalRange() == b.getLocalRange() &&
+            a.getGhosts() == b.getGhosts() &&
             a.getSize() == b.getSize() &&
+            // compare without ghosts
             a.getConstLocalView() == b.getConstLocalView();
       bool result = true;
       if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
@@ -55,6 +57,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
@@ -70,6 +73,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
@@ -85,6 +89,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
@@ -100,6 +105,7 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 1802dcc95..25175a467 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -10,6 +10,7 @@
 
 #pragma once
 #include <utility>
+#include <memory>
 
 #include <TNL/Containers/Expressions/ExpressionTemplates.h>
 #include <TNL/Containers/Expressions/DistributedComparison.h>
@@ -64,6 +65,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType,
                                                         typename T2::ConstLocalViewType,
                                                         Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." );
@@ -79,13 +81,16 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
                      "Attempt to mix operands with different sizes." );
       TNL_ASSERT_EQ( op1.getLocalRange(), op2.getLocalRange(),
                      "Distributed expressions are supported only on vectors which are distributed the same way." );
+      TNL_ASSERT_EQ( op1.getGhosts(), op2.getGhosts(),
+                     "Distributed expressions are supported only on vectors which are distributed the same way." );
       TNL_ASSERT_EQ( op1.getCommunicationGroup(), op2.getCommunicationGroup(),
                      "Distributed expressions are supported only on vectors within the same communication group." );
    }
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -105,6 +110,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getLocalRange();
    }
 
+   IndexType getGhosts() const
+   {
+      return op1.getGhosts();
+   }
+
    CommunicationGroup getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
@@ -115,6 +125,27 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return ConstLocalViewType( op1.getConstLocalView(), op2.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op1.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op1.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op1.waitForSynchronization();
+      op2.waitForSynchronization();
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -132,6 +163,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." );
@@ -141,7 +173,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -161,6 +194,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getLocalRange();
    }
 
+   IndexType getGhosts() const
+   {
+      return op1.getGhosts();
+   }
+
    CommunicationGroup getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
@@ -171,6 +209,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return ConstLocalViewType( op1.getConstLocalView(), op2 );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2 );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op1.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op1.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op1.waitForSynchronization();
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -188,6 +246,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
    using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T2::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >;
+   using SynchronizerType = typename T2::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T2 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the right operand." );
@@ -197,7 +256,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -217,6 +277,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
       return op2.getLocalRange();
    }
 
+   IndexType getGhosts() const
+   {
+      return op2.getGhosts();
+   }
+
    CommunicationGroup getCommunicationGroup() const
    {
       return op2.getCommunicationGroup();
@@ -227,6 +292,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
       return ConstLocalViewType( op1, op2.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1, op2.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op2.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op2.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op2.waitForSynchronization();
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -245,6 +330,7 @@ struct DistributedUnaryExpressionTemplate
    using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed unary expression templates - distributed expression templates are not enabled for the operand." );
@@ -254,7 +340,8 @@ struct DistributedUnaryExpressionTemplate
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -274,6 +361,11 @@ struct DistributedUnaryExpressionTemplate
       return operand.getLocalRange();
    }
 
+   IndexType getGhosts() const
+   {
+      return operand.getGhosts();
+   }
+
    CommunicationGroup getCommunicationGroup() const
    {
       return operand.getCommunicationGroup();
@@ -284,6 +376,26 @@ struct DistributedUnaryExpressionTemplate
       return ConstLocalViewType( operand.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( operand.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return operand.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return operand.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      operand.waitForSynchronization();
+   }
+
 protected:
    const T1& operand;
 };
@@ -812,10 +924,19 @@ template< typename T1,
           typename Operation >
 std::ostream& operator<<( std::ostream& str, const DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression )
 {
+   const auto localRange = expression.getLocalRange();
    str << "[ ";
-   for( int i = 0; i < expression.getSize() - 1; i++ )
+   for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ )
       str << expression.getElement( i ) << ", ";
-   str << expression.getElement( expression.getSize() - 1 ) << " ]";
+   str << expression.getElement( localRange.getEnd() - 1 );
+   if( expression.getGhosts() > 0 ) {
+      str << " | ";
+      const auto localView = expression.getConstLocalViewWithGhosts();
+      for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ )
+         str << localView.getElement( i ) << ", ";
+      str << localView.getElement( localView.getSize() - 1 );
+   }
+   str << " ]";
    return str;
 }
 
@@ -823,10 +944,19 @@ template< typename T,
           typename Operation >
 std::ostream& operator<<( std::ostream& str, const DistributedUnaryExpressionTemplate< T, Operation >& expression )
 {
+   const auto localRange = expression.getLocalRange();
    str << "[ ";
-   for( int i = 0; i < expression.getSize() - 1; i++ )
+   for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ )
       str << expression.getElement( i ) << ", ";
-   str << expression.getElement( expression.getSize() - 1 ) << " ]";
+   str << expression.getElement( localRange.getEnd() - 1 );
+   if( expression.getGhosts() > 0 ) {
+      str << " | ";
+      const auto localView = expression.getConstLocalViewWithGhosts();
+      for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ )
+         str << localView.getElement( i ) << ", ";
+      str << localView.getElement( localView.getSize() - 1 );
+   }
+   str << " ]";
    return str;
 }
 
diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h
index f0b507475..75e958734 100644
--- a/src/TNL/Containers/Partitioner.h
+++ b/src/TNL/Containers/Partitioner.h
@@ -12,7 +12,10 @@
 
 #pragma once
 
+#include <vector>
+
 #include "Subrange.h"
+#include "ByteArraySynchronizer.h"
 
 #include <TNL/Math.h>
 
@@ -66,13 +69,87 @@ public:
       const Index end = min( globalSize, (rank + 1) * globalSize / partitions );
       return end - begin;
    }
-};
 
-// TODO:
-// - partitioner in deal.II stores also ghost indices:
-//   https://www.dealii.org/8.4.0/doxygen/deal.II/classUtilities_1_1MPI_1_1Partitioner.html
-// - ghost indices are stored in a general IndexMap class (based on collection of subranges):
-//   https://www.dealii.org/8.4.0/doxygen/deal.II/classIndexSet.html
+   // Synchronizer of the ghost regions of a distributed 1D array whose
+   // neighboring ranks are connected periodically (the left neighbor of the
+   // first rank is the last rank and vice versa).
+   //
+   // The synchronized byte array is expected to contain the local values
+   // first, followed by `overlaps` ghost values received from the left
+   // neighbor and then `overlaps` ghost values received from the right
+   // neighbor.
+   template< typename Device >
+   class ArraySynchronizer
+   : public ByteArraySynchronizer< Device, Index >
+   {
+      using Base = ByteArraySynchronizer< Device, Index >;
+
+      SubrangeType localRange;
+      int overlaps;
+      CommunicationGroup group;
+
+   public:
+      using ByteArrayView = typename Base::ByteArrayView;
+
+      ArraySynchronizer() = delete;
+
+      // localRange - subrange owned by this rank (only its size is used here)
+      // overlaps   - number of ghost values exchanged with each neighbor
+      // group      - communication group in which the exchange takes place
+      ArraySynchronizer( SubrangeType localRange, int overlaps, CommunicationGroup group )
+      : localRange(localRange), overlaps(overlaps), group(group)
+      {}
+
+      // Exchanges the ghost values with both neighbors and blocks until all
+      // transfers are finished. The first `overlaps` local values are sent to
+      // the left neighbor and the last `overlaps` local values are sent to
+      // the right neighbor; `bytesPerValue` is the size of one value in bytes.
+      virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override
+      {
+         TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps),
+                        "unexpected array size" );
+
+         const int rank = Communicator::GetRank( group );
+         const int nproc = Communicator::GetSize( group );
+         const int left = (rank > 0) ? rank - 1 : nproc - 1;
+         const int right = (rank < nproc - 1) ? rank + 1 : 0;
+
+         // Tags identify the direction in which a message travels. Without
+         // them the two messages exchanged between the same pair of ranks
+         // (left == right, e.g. for 2 processes) are matched in the order of
+         // posting, which stores them in swapped ghost regions.
+         const int tagToLeft = 0;
+         const int tagToRight = 1;
+
+         // buffer for asynchronous communication requests
+         std::vector< typename Communicator::Request > requests;
+
+         // issue all async receive operations
+         // (data from the left neighbor was sent to the right and vice versa)
+         requests.push_back( Communicator::IRecv(
+                  array.getData() + bytesPerValue * localRange.getSize(),
+                  bytesPerValue * overlaps,
+                  left, tagToRight, group ) );
+         requests.push_back( Communicator::IRecv(
+                  array.getData() + bytesPerValue * (localRange.getSize() + overlaps),
+                  bytesPerValue * overlaps,
+                  right, tagToLeft, group ) );
+
+         // issue all async send operations
+         requests.push_back( Communicator::ISend(
+                  array.getData(),
+                  bytesPerValue * overlaps,
+                  left, tagToLeft, group ) );
+         requests.push_back( Communicator::ISend(
+                  array.getData() + bytesPerValue * (localRange.getSize() - overlaps),
+                  bytesPerValue * overlaps,
+                  right, tagToRight, group ) );
+
+         // wait for all communications to finish
+         Communicator::WaitAll( requests.data(), requests.size() );
+      }
+   };
+};
 
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index 40a675f1a..d42ce0ae7 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -177,7 +177,7 @@ DistributedMatrix< Matrix, Communicator >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
    if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
+      rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() );
       auto localRowLengths = rowLengths.getLocalView();
       localMatrix.getCompressedRowLengths( localRowLengths );
    }
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index 097a60d26..d201a0a09 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -13,6 +13,8 @@
 #include <TNL/Containers/DistributedArray.h>
 #include <TNL/Containers/Partitioner.h>
 
+#include "VectorHelperFunctions.h"
+
 using namespace TNL;
 using namespace TNL::Containers;
 
@@ -45,13 +47,20 @@ protected:
    const int rank = CommunicatorType::GetRank(group);
    const int nproc = CommunicatorType::GetSize(group);
 
+   // some arbitrary even value (but must be 0 if not distributed)
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
    DistributedArrayTest()
    {
       using LocalRangeType = typename DistributedArray::LocalRangeType;
       const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
-      distributedArray.setDistribution( localRange, globalSize, group );
+      distributedArray.setDistribution( localRange, ghosts, globalSize, group );
+
+      using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >;
+      distributedArray.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
 
       EXPECT_EQ( distributedArray.getLocalRange(), localRange );
+      EXPECT_EQ( distributedArray.getGhosts(), ghosts );
       EXPECT_EQ( distributedArray.getCommunicationGroup(), group );
    }
 };
@@ -67,6 +76,14 @@ using DistributedArrayTypes = ::testing::Types<
 
 TYPED_TEST_SUITE( DistributedArrayTest, DistributedArrayTypes );
 
+TYPED_TEST( DistributedArrayTest, checkLocalSizes )
+{
+   EXPECT_EQ( this->distributedArray.getLocalView().getSize(), this->distributedArray.getLocalRange().getSize() );
+   EXPECT_EQ( this->distributedArray.getConstLocalView().getSize(), this->distributedArray.getLocalRange().getSize() );
+   EXPECT_EQ( this->distributedArray.getLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts );
+   EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts );
+}
+
 TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes )
 {
    using CommunicatorType = typename TestFixture::CommunicatorType;
@@ -85,14 +102,25 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal )
 
    this->distributedArray.setValue( 0.0 );
    ArrayType globalArray( this->globalSize );
-   globalArray.setValue( 1.0 );
+   setLinearSequence( globalArray );
    this->distributedArray.copyFromGlobal( globalArray );
 
-   ArrayViewType localArrayView = this->distributedArray.getLocalView();
-   auto globalView = globalArray.getConstView();
    const auto localRange = this->distributedArray.getLocalRange();
-   globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getEnd() - localRange.getBegin() );
+   ArrayViewType localArrayView;
+   localArrayView.bind( this->distributedArray.getLocalView().getData(), localRange.getSize() );
+   auto globalView = globalArray.getConstView();
+   globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getSize() );
    EXPECT_EQ( localArrayView, globalView );
+
+   // check ghost values
+   for( int o = 0; o < this->ghosts / 2; o++ ) {
+      const int left_i = localRange.getSize() + o;
+      const int left_gi = ((this->rank > 0) ? localRange.getBegin() : this->globalSize) - this->ghosts / 2 + o;
+      EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( left_i ), globalArray.getElement( left_gi ) );
+      const int right_i = localRange.getSize() + this->ghosts / 2 + o;
+      const int right_gi = ((this->rank < this->nproc - 1) ? localRange.getEnd() : 0) + o;
+      EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( right_i ), globalArray.getElement( right_gi ) );
+   }
 }
 
 TYPED_TEST( DistributedArrayTest, setLike )
@@ -129,6 +157,27 @@ TYPED_TEST( DistributedArrayTest, setValue )
    EXPECT_EQ( localArrayView, expected );
 }
 
+TYPED_TEST( DistributedArrayTest, setValueGhosts )
+{
+   using ArrayViewType = typename TestFixture::ArrayViewType;
+   using ArrayType = typename TestFixture::ArrayType;
+
+   this->distributedArray.setValue( this->rank );
+   ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts();
+   ArrayType expected( localArrayView.getSize() );
+   expected.setValue( this->rank );
+
+   // set expected ghost values
+   const int left = (this->rank > 0) ? this->rank - 1 : this->nproc - 1;
+   const int right = (this->rank < this->nproc - 1) ? this->rank + 1 : 0;
+   for( int o = 0; o < this->ghosts / 2; o++ ) {
+      expected.setElement( this->distributedArray.getLocalRange().getSize() + o, left );
+      expected.setElement( this->distributedArray.getLocalRange().getSize() + this->ghosts / 2 + o, right );
+   }
+
+   EXPECT_EQ( localArrayView, expected );
+}
+
 TYPED_TEST( DistributedArrayTest, elementwiseAccess )
 {
    using ArrayViewType = typename TestFixture::ArrayViewType;
@@ -139,7 +188,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    const auto localRange = this->distributedArray.getLocalRange();
 
    // check initial value
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), 0 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), 0 );
@@ -149,13 +198,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    }
 
    // use setValue
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i + 1 );
    }
 
    // check set value
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), i + 1 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 );
@@ -168,13 +217,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
 
    // use operator[]
    if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
-      for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+      for( IndexType i = 0; i < localRange.getSize(); i++ ) {
          const IndexType gi = localRange.getGlobalIndex( i );
          this->distributedArray[ gi ] = i + 1;
       }
 
       // check set value
-      for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+      for( IndexType i = 0; i < localRange.getSize(); i++ ) {
          const IndexType gi = localRange.getGlobalIndex( i );
          EXPECT_EQ( localArrayView.getElement( i ), i + 1 );
          EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 );
@@ -189,8 +238,9 @@ TYPED_TEST( DistributedArrayTest, copyConstructor )
 
    this->distributedArray.setValue( 1 );
    DistributedArrayType copy( this->distributedArray );
-   // Array has "binding" copy-constructor
-   //EXPECT_EQ( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() );
+   // no binding, but deep copy
+   EXPECT_NE( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() );
+   EXPECT_EQ( copy.getLocalView(), this->distributedArray.getLocalView() );
 }
 
 TYPED_TEST( DistributedArrayTest, copyAssignment )
@@ -216,7 +266,7 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators )
    v.setLike( u );
    w.setLike( u );
 
-   for( int i = 0; i < u.getLocalView().getSize(); i ++ ) {
+   for( int i = 0; i < localRange.getSize(); i ++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       u.setElement( gi, i );
       v.setElement( gi, i );
@@ -245,7 +295,7 @@ TYPED_TEST( DistributedArrayTest, containsValue )
 
    const auto localRange = this->distributedArray.getLocalRange();
 
-   for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i % 10 );
    }
@@ -263,7 +313,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue )
 
    const auto localRange = this->distributedArray.getLocalRange();
 
-   for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i % 10 );
    }
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index 1d727aef6..5a201980c 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -56,11 +56,21 @@ protected:
    // scan with multiple CUDA grids
    const int globalSize = 10000 * nproc;
 
+   // some arbitrary even value (but must be 0 if not distributed)
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
    DistributedVectorTest()
    {
       using LocalRangeType = typename DistributedVector::LocalRangeType;
       const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
-      v.setDistribution( localRange, globalSize, group );
+      v.setDistribution( localRange, ghosts, globalSize, group );
+
+      using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >;
+      using HostSynchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< Devices::Sequential >;
+      v.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
+      v_view.setSynchronizer( v.getSynchronizer() );
+      v_host.setSynchronizer( std::make_shared<HostSynchronizer>( localRange, ghosts / 2, group ) );
+
       v_view.bind( v );
       setConstantSequence( v, 1 );
    }
@@ -77,6 +87,8 @@ using DistributedVectorTypes = ::testing::Types<
 
 TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes );
 
+// TODO: test that horizontal operations are computed for ghost values without synchronization
+
 TYPED_TEST( DistributedVectorTest, scan )
 {
    using RealType = typename TestFixture::DistributedVectorType::RealType;
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index 7f81d87f5..b659beaea 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -66,6 +66,14 @@ protected:
                      "CommunicatorType must be the same for both Left and Right vectors." );
       using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >;
       using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >;
+
+      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+
+      const int rank = CommunicatorType::GetRank(group);
+      const int nproc = CommunicatorType::GetSize(group);
+
+      // some arbitrary even value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using LeftVector = Vector< LeftReal, typename Left::DeviceType, typename Left::IndexType >;
       using RightVector = Vector< RightReal, typename Right::DeviceType, typename Right::IndexType >;
@@ -89,14 +97,20 @@ protected:
       R2 = 2;
 #else
    #ifdef DISTRIBUTED_VECTOR
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
       using LocalRangeType = typename LeftVector::LocalRangeType;
+      using Synchronizer = typename Partitioner< typename Left::IndexType, CommunicatorType >::template ArraySynchronizer< typename Left::DeviceType >;
       const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group );
 
-      _L1.setDistribution( localRange, size, group );
-      _L2.setDistribution( localRange, size, group );
-      _R1.setDistribution( localRange, size, group );
-      _R2.setDistribution( localRange, size, group );
+      _L1.setDistribution( localRange, ghosts, size, group );
+      _L2.setDistribution( localRange, ghosts, size, group );
+      _R1.setDistribution( localRange, ghosts, size, group );
+      _R2.setDistribution( localRange, ghosts, size, group );
+
+      auto synchronizer = std::make_shared<Synchronizer>( localRange, ghosts / 2, group );
+      _L1.setSynchronizer( synchronizer );
+      _L2.setSynchronizer( synchronizer );
+      _R1.setSynchronizer( synchronizer );
+      _R2.setSynchronizer( synchronizer );
    #else
       _L1.setSize( size );
       _L2.setSize( size );
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index 649de1cee..b7e8a1b95 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -9,15 +9,17 @@ void setLinearSequence( Vector& deviceVector )
 #ifdef STATIC_VECTOR
    Vector a;
 #else
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #endif
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = gi;
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ )
+      a.getLocalView()[ i ] = -1;  // dummy ghost value
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = i;
@@ -62,10 +64,12 @@ void setNegativeLinearSequence( Vector& deviceVector )
    HostVector a;
    a.setLike( deviceVector );
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = -gi;
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ )
+      a.getLocalView()[ i ] = 1;  // dummy ghost value
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = -i;
@@ -85,10 +89,12 @@ void setOscilatingSequence( Vector& deviceVector,
    a.setLike( deviceVector );
 #endif
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = v * std::pow( -1, gi );
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ )
+      a.getLocalView()[ i ] = 42;  // dummy ghost value
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = v * std::pow( -1, i );
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index 867adb069..27422513b 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -55,6 +55,14 @@ protected:
       using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
       template< typename Real >
       using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+
+      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+
+      const int rank = CommunicatorType::GetRank(group);
+      const int nproc = CommunicatorType::GetSize(group);
+
+      // some arbitrary even value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
@@ -167,13 +175,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       using VectorType = typename TestFixture::VectorType;     \
       using VectorOrView = typename TestFixture::VectorOrView; \
       using CommunicatorType = typename VectorOrView::CommunicatorType; \
-      const auto group = CommunicatorType::AllGroup; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       VectorType _V1, _V2;                                     \
-      _V1.setDistribution( localRange, size, group );          \
-      _V2.setDistribution( localRange, size, group );          \
+      _V1.setDistribution( localRange, this->ghosts, size, this->group ); \
+      _V2.setDistribution( localRange, this->ghosts, size, this->group ); \
+                                                               \
+      auto _synchronizer = std::make_shared<Synchronizer>( localRange, this->ghosts / 2, this->group ); \
+      _V1.setSynchronizer( _synchronizer );                    \
+      _V2.setSynchronizer( _synchronizer );                    \
                                                                \
       _V1 = 1;                                                 \
       _V2 = 2;                                                 \
@@ -188,14 +200,14 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \
       using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \
       using CommunicatorType = typename VectorOrView::CommunicatorType; \
-      const auto group = CommunicatorType::AllGroup; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       HostVector _V1h;                                         \
       HostExpectedVector expected_h;                           \
-      _V1h.setDistribution( localRange, size, group );         \
-      expected_h.setDistribution( localRange, size, group );   \
+      _V1h.setDistribution( localRange, this->ghosts, size, this->group ); \
+      expected_h.setDistribution( localRange, this->ghosts, size, this->group ); \
                                                                \
       const double h = (double) (end - begin) / size;          \
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \
@@ -204,10 +216,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
+      for( int i = localRange.getSize(); i < _V1h.getLocalView().getSize(); i++ ) \
+         _V1h.getLocalView()[ i ] = expected_h.getLocalView()[ i ] = 0;           \
                                                                \
       VectorType _V1; _V1 = _V1h;                              \
       VectorOrView V1( _V1 );                                  \
       ExpectedVector expected; expected = expected_h;          \
+                                                               \
+      auto _synchronizer = std::make_shared<Synchronizer>( localRange, this->ghosts / 2, this->group ); \
+      _V1.setSynchronizer( _synchronizer );                    \
+      expected.setSynchronizer( _synchronizer );               \
+      expected.startSynchronization();                         \
 
 #else
    #define SETUP_UNARY_VECTOR_TEST( size ) \
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index ac7fa79d6..4ad0c8303 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -56,6 +56,14 @@ protected:
       using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
       template< typename Real >
       using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+
+      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+
+      const int rank = CommunicatorType::GetRank(group);
+      const int nproc = CommunicatorType::GetSize(group);
+
+      // some arbitrary value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
@@ -75,11 +83,11 @@ protected:
       setLinearSequence( V1 );
 #else
    #ifdef DISTRIBUTED_VECTOR
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
       using LocalRangeType = typename VectorOrView::LocalRangeType;
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >;
       const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group );
-
-      _V1.setDistribution( localRange, size, group );
+      _V1.setDistribution( localRange, ghosts, size, group );
+      _V1.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
    #else
       _V1.setSize( size );
    #endif
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index ea5a7e582..4cc584672 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -89,7 +89,7 @@ protected:
       using LocalRangeType = typename DistributedMatrix::LocalRangeType;
       const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
       matrix.setDistribution( localRange, globalSize, globalSize, group );
-      rowCapacities.setDistribution( localRange, globalSize, group );
+      rowCapacities.setDistribution( localRange, 0, globalSize, group );
 
       EXPECT_EQ( matrix.getLocalRowRange(), localRange );
       EXPECT_EQ( matrix.getCommunicationGroup(), group );
@@ -215,7 +215,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
 
    GlobalVector inVector( this->globalSize );
    inVector.setValue( 1 );
-   DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    this->matrix.vectorProduct( inVector, outVector );
 
    EXPECT_EQ( outVector, this->rowCapacities )
@@ -230,9 +230,9 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
    this->matrix.setRowCapacities( this->rowCapacities );
    setMatrix( this->matrix, this->rowCapacities );
 
-   DistributedVector inVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector inVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    inVector.setValue( 1 );
-   DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    this->matrix.vectorProduct( inVector, outVector );
 
    EXPECT_EQ( outVector, this->rowCapacities )
-- 
GitLab


From 4f5444a5f2bacf1d589714222229f57329eb5991 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 18 Nov 2020 23:49:42 +0100
Subject: [PATCH 25/50] DistributedMatrix: implemented vectorProduct using
 ghost ranges

---
 src/TNL/Matrices/DistributedMatrix_impl.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index d42ce0ae7..b9638e002 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -285,7 +285,6 @@ DistributedMatrix< Matrix, Communicator >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector ) const
 {
-   TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" );
    TNL_ASSERT_EQ( inVector.getLocalRange(), getLocalRowRange(), "input vector has wrong distribution" );
    TNL_ASSERT_EQ( inVector.getCommunicationGroup(), getCommunicationGroup(), "input vector has wrong communication group" );
    TNL_ASSERT_EQ( outVector.getSize(), getRows(), "output vector has wrong size" );
@@ -295,7 +294,24 @@ vectorProduct( const InVector& inVector,
    if( getCommunicationGroup() == CommunicatorType::NullGroup )
       return;
 
-   const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() );
+   if( inVector.getGhosts() == 0 ) {
+      // NOTE: this branch is deprecated and kept only due to existing benchmarks
+      TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" );
+      const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() );
+   }
+   else {
+      TNL_ASSERT_EQ( inVector.getConstLocalViewWithGhosts().getSize(), localMatrix.getColumns(), "the matrix uses non-local and non-ghost column indices" );
+      TNL_ASSERT_EQ( inVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "input vector has wrong ghosts size" );
+      TNL_ASSERT_EQ( outVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "output vector has wrong ghosts size" );
+      TNL_ASSERT_EQ( outVector.getConstLocalView().getSize(), localMatrix.getRows(), "number of local matrix rows does not match the output vector local size" );
+
+      inVector.waitForSynchronization();
+      const auto inView = inVector.getConstLocalViewWithGhosts();
+      auto outView = outVector.getLocalView();
+      localMatrix.vectorProduct( inView, outView );
+      // TODO: synchronization is not always necessary, e.g. when a preconditioning step follows
+//      outVector.startSynchronization();
+   }
 }
 
 template< typename Matrix,
-- 
GitLab


From 364f03bf3078fe101cbf6b67d71a54ca2a8f3800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 20 Nov 2020 14:30:31 +0100
Subject: [PATCH 26/50] Fixed the diagonal preconditioner for ghost ranges

---
 .../Linear/Preconditioners/Diagonal_impl.h    | 51 ++++++++++---------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index f30151548..474a78f21 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -49,14 +49,7 @@ void
 Diagonal< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   ConstVectorViewType diag_view( diagonal );
-
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      x[ i ] = b[ i ] / diag_view[ i ];
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   x = b / diagonal;
 }
 
 
@@ -66,23 +59,32 @@ Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >::
 update( const MatrixPointer& matrixPointer )
 {
    TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
-   TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" );
-
    diagonal.setSize( matrixPointer->getLocalMatrix().getRows() );
 
    LocalViewType diag_view( diagonal );
    // FIXME: SparseMatrix::getConstView is broken
 //   const auto matrix_view = matrixPointer->getLocalMatrix().getConstView();
    const auto matrix_view = matrixPointer->getLocalMatrix().getView();
-   const auto row_range = matrixPointer->getLocalRowRange();
 
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      const IndexType gi = row_range.getGlobalIndex( i );
-      diag_view[ i ] = matrix_view.getElement( i, gi );
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   if( matrixPointer->getRows() == matrixPointer->getColumns() ) {
+      // square matrix, assume global column indices
+      const auto row_range = matrixPointer->getLocalRowRange();
+      auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+      {
+         const IndexType gi = row_range.getGlobalIndex( i );
+         diag_view[ i ] = matrix_view.getElement( i, gi );
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   }
+   else {
+      // non-square matrix, assume ghost indexing
+      TNL_ASSERT_LT( matrixPointer->getLocalMatrix().getRows(), matrixPointer->getLocalMatrix().getColumns(), "the local matrix should have more columns than rows" );
+      auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+      {
+         diag_view[ i ] = matrix_view.getElement( i, i );
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   }
 }
 
 template< typename Matrix, typename Communicator >
@@ -94,15 +96,14 @@ solve( ConstVectorViewType b, VectorViewType x ) const
    const auto b_view = b.getConstLocalView();
    auto x_view = x.getLocalView();
 
-   TNL_ASSERT_EQ( b_view.getSize(), diagonal.getSize(), "The size of the vector b does not match the size of the extracted diagonal." );
-   TNL_ASSERT_EQ( x_view.getSize(), diagonal.getSize(), "The size of the vector x does not match the size of the extracted diagonal." );
+   // wait for pending synchronization
+   b.waitForSynchronization();
 
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      x_view[ i ] = b_view[ i ] / diag_view[ i ];
-   };
+   // compute without ghosts (diagonal includes only local rows)
+   x_view = b_view / diag_view;
 
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   // synchronize ghosts
+   x.startSynchronization();
 }
 
 } // namespace Preconditioners
-- 
GitLab


From c8815636058e85882b31967bda048284af8f6511 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 2 Dec 2020 17:00:13 +0100
Subject: [PATCH 27/50] Fixed hardcoded entity shapes in VTUWriter

---
 src/TNL/Meshes/Writers/VTUWriter.hpp | 41 +++++++++++++++++-----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp
index 61872ffe1..c8093010d 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTUWriter.hpp
@@ -83,6 +83,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -94,7 +95,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
          connectivity.push_back( i );
          connectivity.push_back( i+1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -106,6 +107,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -116,7 +118,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -128,6 +130,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 2 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 2 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -142,7 +145,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -154,6 +157,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -161,21 +165,21 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
                      std::vector< std::uint8_t > & types )
    {
       for( MeshIndex j = 0; j < mesh.getDimensions().y(); j++ )
-      for( MeshIndex i = 0; i < ( mesh.getDimensions().x() + 1 ); i++ )
+      for( MeshIndex i = 0; i < (mesh.getDimensions().x() + 1); i++ )
       {
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
-      for( MeshIndex j = 0; j < (mesh.getDimensions().y()+1); j++ )
+      for( MeshIndex j = 0; j < (mesh.getDimensions().y() + 1); j++ )
       for( MeshIndex i = 0; i < mesh.getDimensions().x(); i++ )
       {
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -187,6 +191,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -198,7 +203,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( j * mesh.getDimensions().x() + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -210,6 +215,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 3 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -229,7 +235,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Voxel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -241,6 +247,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 2 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -256,7 +263,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ )
@@ -268,7 +275,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ )
@@ -280,7 +287,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -292,6 +299,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -305,7 +313,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ )
@@ -315,7 +323,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ )
@@ -325,7 +333,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -337,6 +345,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -349,7 +358,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
-- 
GitLab


From 99b346f6dee1fbb831eb315b36d556ab3dfc6084 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 2 Dec 2020 18:24:57 +0100
Subject: [PATCH 28/50] Added overload of getEntityMeasure for hexahedrons

---
 src/TNL/Meshes/Geometry/getEntityCenter.h  |  2 +-
 src/TNL/Meshes/Geometry/getEntityMeasure.h | 24 ++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Meshes/Geometry/getEntityCenter.h b/src/TNL/Meshes/Geometry/getEntityCenter.h
index 6e869f6ec..addef6b9f 100644
--- a/src/TNL/Meshes/Geometry/getEntityCenter.h
+++ b/src/TNL/Meshes/Geometry/getEntityCenter.h
@@ -39,7 +39,7 @@ getEntityCenter( const Mesh< MeshConfig, Device > & mesh,
 /*
  * Get an arithmetic mean of the entity's subvertices.
  *
- * For an simplex entity this corresponds to the centroid of the entity, but
+ * For a simplex entity this corresponds to the centroid of the entity, but
  * note that other shapes such as general polygons have different formulas for
  * the centroid: https://en.wikipedia.org/wiki/Centroid#Centroid_of_a_polygon
  */
diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h
index 70d5614ce..fb1e2d468 100644
--- a/src/TNL/Meshes/Geometry/getEntityMeasure.h
+++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h
@@ -19,6 +19,7 @@
 #include <TNL/Meshes/Topologies/Triangle.h>
 #include <TNL/Meshes/Topologies/Quadrangle.h>
 #include <TNL/Meshes/Topologies/Tetrahedron.h>
+#include <TNL/Meshes/Topologies/Hexahedron.h>
 
 namespace TNL {
 namespace Meshes {
@@ -148,5 +149,28 @@ getEntityMeasure( const Mesh< MeshConfig, Device > & mesh,
     return getTetrahedronVolume( v3 - v0, v2 - v0, v1 - v0 );
 }
 
+template< typename MeshConfig, typename Device >
+__cuda_callable__
+typename MeshConfig::RealType
+getEntityMeasure( const Mesh< MeshConfig, Device > & mesh,
+                  const MeshEntity< MeshConfig, Device, Topologies::Hexahedron > & entity )
+{
+    const auto& v0 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 0 ) );
+    const auto& v1 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 1 ) );
+    const auto& v2 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 2 ) );
+    const auto& v3 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 3 ) );
+    const auto& v4 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 4 ) );
+    const auto& v5 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 5 ) );
+    const auto& v6 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 6 ) );
+    const auto& v7 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 7 ) );
+    // Sum the volumes of six tetrahedra, each spanned by edge vectors taken from the common vertex v4; see https://www.cfd-online.com/Forums/main/163122-volume-general-hexahedron.html#post574650
+    return getTetrahedronVolume( v0 - v4, v3 - v4, v1 - v4 )
+         + getTetrahedronVolume( v2 - v4, v3 - v4, v1 - v4 )
+         + getTetrahedronVolume( v1 - v4, v2 - v4, v5 - v4 )
+         + getTetrahedronVolume( v6 - v4, v2 - v4, v5 - v4 )
+         + getTetrahedronVolume( v3 - v4, v2 - v4, v7 - v4 )
+         + getTetrahedronVolume( v6 - v4, v2 - v4, v7 - v4 );
+}
+
 } // namespace Meshes
 } // namespace TNL
-- 
GitLab


From 45e6fc1a6ec563b949c51cff456804b4023d4eb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 2 Dec 2020 17:08:05 +0100
Subject: [PATCH 29/50] pytnl: added bindings for MeshOfQuadrangles and
 MeshOfHexahedrons

---
 src/Python/pytnl/tnl/Mesh.cpp                 |  2 ++
 src/Python/pytnl/tnl/MeshReaders.cpp          |  2 ++
 src/Python/pytnl/tnl/MeshWriters.cpp          |  4 +++
 src/Python/pytnl/tnl_mpi/DistributedMesh.cpp  |  2 ++
 .../pytnl/tnl_mpi/DistributedMeshReaders.cpp  |  2 ++
 .../pytnl/tnl_mpi/DistributedMeshWriters.cpp  |  2 ++
 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp          |  4 +++
 src/Python/pytnl/typedefs.h                   | 32 ++++++++-----------
 8 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp
index a3e582680..48e3f939b 100644
--- a/src/Python/pytnl/tnl/Mesh.cpp
+++ b/src/Python/pytnl/tnl/Mesh.cpp
@@ -7,5 +7,7 @@ void export_Meshes( py::module & m )
 {
     export_Mesh< MeshOfEdges >( m, "MeshOfEdges" );
     export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" );
+    export_Mesh< MeshOfQuadrangles >( m, "MeshOfQuadrangles" );
     export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" );
+    export_Mesh< MeshOfHexahedrons >( m, "MeshOfHexahedrons" );
 }
diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp
index d47ec5268..c4abae015 100644
--- a/src/Python/pytnl/tnl/MeshReaders.cpp
+++ b/src/Python/pytnl/tnl/MeshReaders.cpp
@@ -17,7 +17,9 @@ void export_MeshReaders( py::module & m )
         .def("detectMesh", &MeshReader::detectMesh)
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >)
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfQuadrangles >)
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfHexahedrons >)
     ;
 
     py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" )
diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp
index 17c3c7492..78eca5e05 100644
--- a/src/Python/pytnl/tnl/MeshWriters.cpp
+++ b/src/Python/pytnl/tnl/MeshWriters.cpp
@@ -83,6 +83,10 @@ void export_MeshWriters( py::module & m )
     export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" );
     export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTriangles" );
     export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfQuadrangles >,  TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfQuadrangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfQuadrangles >,  TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfQuadrangles" );
     export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTetrahedrons" );
     export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfHexahedrons >,  TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfHexahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfHexahedrons >,  TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfHexahedrons" );
 }
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
index 03ee3692e..0af175f3c 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
@@ -12,7 +12,9 @@ void export_DistributedMeshes( py::module & m )
 
     export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" );
     export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" );
+    export_DistributedMesh< DistributedMeshOfQuadrangles >( m, "DistributedMeshOfQuadrangles" );
     export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" );
+    export_DistributedMesh< DistributedMeshOfHexahedrons >( m, "DistributedMeshOfHexahedrons" );
 
     // export VTKTypesArrayType
     using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType;
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
index e972eb65e..c196a67cc 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
@@ -19,6 +19,8 @@ void export_DistributedMeshReaders( py::module & m )
         // loadMesh is not virtual in PVTUReader
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >)
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfQuadrangles >)
         .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfHexahedrons >)
     ;
 }
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
index 4d1d18bae..089d59adf 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
@@ -90,5 +90,7 @@ void export_DistributedMeshWriters( py::module & m )
     constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed;
     export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges,        default_format >( m, "PVTUWriter_MeshOfEdges" );
     export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles,    default_format >( m, "PVTUWriter_MeshOfTriangles" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfQuadrangles,    default_format >( m, "PVTUWriter_MeshOfQuadrangles" );
     export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfHexahedrons, default_format >( m, "PVTUWriter_MeshOfHexahedrons" );
 }
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
index be7813959..a12060600 100644
--- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -39,6 +39,10 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m)
     using TNL::Meshes::DistributedMeshes::distributeSubentities;
     m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) {
           distributeSubentities< 1 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfQuadrangles& mesh ) {
+          distributeSubentities< 1 >( mesh ); });
     m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) {
           distributeSubentities< 2 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfHexahedrons& mesh ) {
+          distributeSubentities< 2 >( mesh ); });
 }
diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h
index ac4b6bd83..7bc9fe025 100644
--- a/src/Python/pytnl/typedefs.h
+++ b/src/Python/pytnl/typedefs.h
@@ -20,7 +20,9 @@
 #include <TNL/Meshes/DefaultConfig.h>
 #include <TNL/Meshes/Topologies/Edge.h>
 #include <TNL/Meshes/Topologies/Triangle.h>
+#include <TNL/Meshes/Topologies/Quadrangle.h>
 #include <TNL/Meshes/Topologies/Tetrahedron.h>
+#include <TNL/Meshes/Topologies/Hexahedron.h>
 
 using RealType = double;
 using DeviceType = TNL::Devices::Host;
@@ -31,28 +33,22 @@ using Grid2D = TNL::Meshes::Grid<2, RealType, DeviceType, IndexType>;
 using Grid3D = TNL::Meshes::Grid<3, RealType, DeviceType, IndexType>;
 
 using LocalIndexType = short int;
-using EdgeTopology = TNL::Meshes::Topologies::Edge;
-using TriangleTopology = TNL::Meshes::Topologies::Triangle;
-using TetrahedronTopology = TNL::Meshes::Topologies::Tetrahedron;
-using MeshOfEdges = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            EdgeTopology,
-                            EdgeTopology::dimension,
-                            RealType,
-                            IndexType,
-                            LocalIndexType > >;
-using MeshOfTriangles = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            TriangleTopology,
-                            TriangleTopology::dimension,
-                            RealType,
-                            IndexType,
-                            LocalIndexType > >;
-using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            TetrahedronTopology,
-                            TetrahedronTopology::dimension,
+template< typename Topology >
+using DefaultMeshTemplate = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
+                            Topology,
+                            Topology::dimension,
                             RealType,
                             IndexType,
                             LocalIndexType > >;
 
+using MeshOfEdges = DefaultMeshTemplate< TNL::Meshes::Topologies::Edge >;
+using MeshOfTriangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Triangle >;
+using MeshOfQuadrangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Quadrangle >;
+using MeshOfTetrahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Tetrahedron >;
+using MeshOfHexahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Hexahedron >;
+
 using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >;
 using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >;
+using DistributedMeshOfQuadrangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfQuadrangles >;
 using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >;
+using DistributedMeshOfHexahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfHexahedrons >;
-- 
GitLab


From 966d73877cfec43065f55a112a0762e846f02c58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 5 Dec 2020 20:54:10 +0100
Subject: [PATCH 30/50] pytnl: for completeness, export bindings for mesh
 writers for grids

---
 src/Python/pytnl/tnl/MeshWriters.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp
index 78eca5e05..01f79ce2d 100644
--- a/src/Python/pytnl/tnl/MeshWriters.cpp
+++ b/src/Python/pytnl/tnl/MeshWriters.cpp
@@ -79,6 +79,13 @@ void export_MeshWriter( py::module & m, const char* name )
 
 void export_MeshWriters( py::module & m )
 {
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid1D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid1D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid2D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid2D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid3D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid3D" );
+
     export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfEdges" );
     export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" );
     export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTriangles" );
-- 
GitLab


From 863a4f698e894146739e957e16c8a99f24a5906c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 20 Dec 2020 17:00:20 +0100
Subject: [PATCH 31/50] getOutwardNormalVector: added overloads for 2D and 3D
 unstructured meshes

---
 .../Meshes/Geometry/getOutwardNormalVector.h  | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
index 536800862..d3fa6ea50 100644
--- a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
+++ b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
+#include <TNL/Meshes/Topologies/Edge.h>
 
 namespace TNL {
 namespace Meshes {
@@ -87,5 +88,63 @@ getOutwardNormalVector( const Grid & grid,
    }
 }
 
+template< typename MeshConfig, typename Device >
+__cuda_callable__
+typename MeshTraits< MeshConfig >::PointType
+getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh,
+                        const MeshEntity< MeshConfig, Device, Topologies::Edge > & face,
+                        typename MeshTraits< MeshConfig >::PointType cellCenter )
+{
+   using MeshType = Mesh< MeshConfig, Device >;
+   using FaceType = MeshEntity< MeshConfig, Device, Topologies::Edge >;
+   using PointType = typename MeshTraits< MeshConfig >::PointType;
+   static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" );
+   static_assert( MeshConfig::worldDimension == 2, "TODO: normal vectors for 2D meshes in a 3D space are not implemented yet" );
+
+   const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) );
+   const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) );
+   const PointType u = v0 - v1;
+   const PointType n {u[1], -u[0]};
+
+   // check on which side of the face is the reference cell center
+   const PointType faceCenter = getEntityCenter( mesh, face );
+   if( dot( n, cellCenter - faceCenter ) < 0 )
+      return n / l2Norm( n );
+   else
+      return - n / l2Norm( n );
+}
+
+template< typename MeshConfig, typename Device, typename EntityTopology >
+__cuda_callable__
+typename MeshTraits< MeshConfig >::PointType
+getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh,
+                        const MeshEntity< MeshConfig, Device, EntityTopology > & face,
+                        typename MeshTraits< MeshConfig >::PointType cellCenter )
+{
+   using MeshType = Mesh< MeshConfig, Device >;
+   using FaceType = MeshEntity< MeshConfig, Device, EntityTopology >;
+   using PointType = typename MeshTraits< MeshConfig >::PointType;
+   static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" );
+   static_assert( MeshConfig::worldDimension == 3, "general overload intended for 3D was called with the wrong world dimension" );
+
+   const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) );
+   const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) );
+   const auto& v2 = mesh.getPoint( face.template getSubentityIndex< 0 >( 2 ) );
+   const PointType u1 = v0 - v1;
+   const PointType u2 = v0 - v2;
+   const PointType n {
+      u1.y() * u2.z() - u1.z() * u2.y(),   // first component of the cross product
+      u1.z() * u2.x() - u1.x() * u2.z(),   // second component of the cross product
+      u1.x() * u2.y() - u1.y() * u2.x()    // third component of the cross product
+   };
+
+   // check on which side of the face is the reference cell center
+   const PointType faceCenter = getEntityCenter( mesh, face );
+   if( dot( n, cellCenter - faceCenter ) < 0 )
+      return n / l2Norm( n );
+   else
+      return - n / l2Norm( n );
+}
+
 } // namespace Meshes
 } // namespace TNL
-- 
GitLab


From 65e1cc9e42c05068d3fcc5080f1e38451f384d48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 26 Dec 2020 14:46:48 +0100
Subject: [PATCH 32/50] Fixed ILU preconditioners for distributed matrices

---
 src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h | 3 ---
 src/TNL/Solvers/Linear/Preconditioners/ILU0.h          | 7 ++++++-
 src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h     | 3 +++
 src/TNL/Solvers/Linear/Preconditioners/ILUT.h          | 7 ++++++-
 src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h     | 3 +++
 src/TNL/Solvers/Linear/Traits.h                        | 4 ++++
 6 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 474a78f21..d2227e57b 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -96,9 +96,6 @@ solve( ConstVectorViewType b, VectorViewType x ) const
    const auto b_view = b.getConstLocalView();
    auto x_view = x.getLocalView();
 
-   // wait for pending synchronization
-   b.waitForSynchronization();
-
    // compute without ghosts (diagonal includes only local rows)
    x_view = b_view / diag_view;
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index c4b409bb3..857d8a063 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -90,7 +90,12 @@ protected:
    template< typename M >
    static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m )
    {
-      return m.getLocalRowRange().getBegin();
+      if( m.getRows() == m.getColumns() )
+         // square matrix, assume global column indices
+         return m.getLocalRowRange().getBegin();
+      else
+         // non-square matrix, assume ghost indexing
+         return 0;
    }
 };
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index c11909c07..f68a93f16 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -145,6 +145,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const
 
    // Step 2: solve x from Ux = y
    triangularSolveUpper< true, true >( U, x, x );
+
+   // synchronize ghosts
+   Traits< Matrix >::startSynchronization( _x );
 }
 
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index d46f3f900..344daf1a0 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -79,7 +79,12 @@ protected:
    template< typename M >
    static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m )
    {
-      return m.getLocalRowRange().getBegin();
+      if( m.getRows() == m.getColumns() )
+         // square matrix, assume global column indices
+         return m.getLocalRowRange().getBegin();
+      else
+         // non-square matrix, assume ghost indexing
+         return 0;
    }
 };
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index c9c2a0b77..21b895c48 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -272,6 +272,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const
 
    // Step 2: solve x from Ux = y
    triangularSolveUpper< true, false >( U, x, x );
+
+   // synchronize ghosts
+   Traits< Matrix >::startSynchronization( _x );
 }
 
 } // namespace Preconditioners
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 5f93e0cde..83313ed98 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -52,6 +52,8 @@ struct Traits
    static LocalViewType getLocalView( VectorViewType v ) { return v; }
 
    static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; }
+   static void startSynchronization( VectorViewType v ) {}
+   static void waitForSynchronization( VectorViewType v ) {}
 };
 
 template< typename Matrix, typename Communicator >
@@ -95,6 +97,8 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
    static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); }
 
    static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
+   static void startSynchronization( VectorViewType v ) { v.startSynchronization(); }
+   static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); }
 };
 
 } // namespace Linear
-- 
GitLab


From b9d087074e03d8ff2cac6cddab5b7667f8b31aad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 27 Dec 2020 18:29:59 +0100
Subject: [PATCH 33/50] Use MPI_Init_thread instead of MPI_Init

This allows the user to set the required thread level. Initializing MPI
with threading support is needed when MPI functions are called from
multiple threads. All common MPI libraries seem to provide this feature.
---
 src/TNL/Communicators/MpiCommunicator.h | 36 +++++++++++++++++++++++--
 src/TNL/Communicators/MpiDefs.h         |  8 ++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 18143cce0..dedc35f03 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -142,10 +142,42 @@ class MpiCommunicator
          return true;
       }
 
-      static void Init(int& argc, char**& argv )
+      static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
       {
 #ifdef HAVE_MPI
-         MPI_Init( &argc, &argv );
+         switch( required_thread_level ) {
+            case MPI_THREAD_SINGLE:
+            case MPI_THREAD_FUNNELED:
+            case MPI_THREAD_SERIALIZED:
+            case MPI_THREAD_MULTIPLE:
+               break;
+            default:
+               printf("ERROR: invalid argument for the 'required' thread level support: %d\n", required_thread_level);
+               MPI_Abort(MPI_COMM_WORLD, 1);
+         }
+
+         int provided;
+         MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
+         if( provided < required_thread_level ) {
+            const char* level = "";
+            switch( required_thread_level ) {
+               case MPI_THREAD_SINGLE:
+                  level = "MPI_THREAD_SINGLE";
+                  break;
+               case MPI_THREAD_FUNNELED:
+                  level = "MPI_THREAD_FUNNELED";
+                  break;
+               case MPI_THREAD_SERIALIZED:
+                  level = "MPI_THREAD_SERIALIZED";
+                  break;
+               case MPI_THREAD_MULTIPLE:
+                  level = "MPI_THREAD_MULTIPLE";
+                  break;
+            }
+            printf("ERROR: The MPI library does not have the required level of thread support: %s\n", level);
+            MPI_Abort(MPI_COMM_WORLD, 1);
+         }
+
          selectGPU();
 #endif
 
diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/Communicators/MpiDefs.h
index 957354b9d..df43005ec 100644
--- a/src/TNL/Communicators/MpiDefs.h
+++ b/src/TNL/Communicators/MpiDefs.h
@@ -25,4 +25,12 @@ enum MPI_Op {
    MPI_MINLOC,
    MPI_MAXLOC,
 };
+
+// MPI_Init_thread constants
+enum {
+  MPI_THREAD_SINGLE,
+  MPI_THREAD_FUNNELED,
+  MPI_THREAD_SERIALIZED,
+  MPI_THREAD_MULTIPLE
+};
 #endif
-- 
GitLab


From cff4ab335edf759275959b97f38d83645f5ca6d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 27 Dec 2020 18:45:50 +0100
Subject: [PATCH 34/50] Implemented asynchronous operations for
 ByteArraySynchronizer

---
 CMakeLists.txt                                |   2 +-
 src/3rdparty/async/README.md                  | 532 ++++++++++++++++++
 src/3rdparty/async/bounded_queue.h            | 342 +++++++++++
 src/3rdparty/async/queue.h                    | 429 ++++++++++++++
 src/3rdparty/async/threadpool.h               | 192 +++++++
 src/3rdparty/async/utility.h                  |  66 +++
 src/TNL/Containers/ByteArraySynchronizer.h    | 117 +++-
 src/TNL/Containers/DistributedArray.h         |   2 +
 src/TNL/Containers/DistributedArray.hpp       |  12 +
 src/TNL/Containers/DistributedArrayView.h     |   4 +
 src/TNL/Containers/DistributedArrayView.hpp   |  27 +-
 src/TNL/Containers/Partitioner.h              |  19 +-
 .../DistributedMeshSynchronizer.h             |  42 +-
 .../Containers/DistributedArrayTest.h         |   6 +
 14 files changed, 1758 insertions(+), 34 deletions(-)
 create mode 100644 src/3rdparty/async/README.md
 create mode 100644 src/3rdparty/async/bounded_queue.h
 create mode 100644 src/3rdparty/async/queue.h
 create mode 100644 src/3rdparty/async/threadpool.h
 create mode 100644 src/3rdparty/async/utility.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05a0fd0b6..b85842c1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,7 @@ if( ${WITH_CUDA} )
                set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} )
             endif()
         endif()
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda --default-stream per-thread)
         # disable false compiler warnings
         #   reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937
         #   incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
diff --git a/src/3rdparty/async/README.md b/src/3rdparty/async/README.md
new file mode 100644
index 000000000..36106864a
--- /dev/null
+++ b/src/3rdparty/async/README.md
@@ -0,0 +1,532 @@
+# async
+Homepage: https://github.com/d36u9/async
+
+[[License(Boost Software License - Version 1.0)](http://www.boost.org/LICENSE_1_0.txt)]
+
+## Welcome
+async is a tiny C++ header-only high-performance library for async calls handled by a thread-pool, which is built on top of an unbounded MPMC lock-free queue.
+It's written in pure C++14 (C++11 support with preprocessor macros), no dependencies on other 3rd party libraries.
+
+Note: This library was originally designed for 64-bit systems. It has been tested on X86-64, ARMV8 (64-bit), and ARMV7 (32-bit).
+
+## change logs
+* Jun. 2018:
+  * Added support for ARMV7 & V8
+  * Tested on Raspberry Pi 3 B+ with Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64)
+  * Tested on Raspberry Pi 3 B+ with Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l)
+  * Added Benchmark Results for Raspberry Pi 3 B+ ARMV8 (Linux Pi64 4.14.44-V8 AArch64)
+  * Added Benchmark Results for Raspberry Pi 3 B+ ARMV7 32bit (Linux 4.14.34-v7 armv7l)
+* Sept. 2017:
+  * Significantly improved the performance of async::queue without bulk operations.
+  * async::threadpool also benefits from this change.
+  * A bounded MPMC queue `async::bounded_queue` was added to the lib, which is pretty useful for memory constrained systems or some fixed-size message pipeline designs. The overall performance of this buffer based `async::bounded_queue` is comparable to bulk operations of node-based `async::queue`. `async::bounded_queue` shares an almost identical interface with `async::queue`, except for bulk operations; a size parameter has to be passed to `bounded_queue`'s constructor, and blocking methods (`blocking_enqueue` & `blocking_dequeue`) were added. The `TRAIT::NOEXCEPT_CHECK` setting is also similar to `async::queue` to help handle exceptions that may be thrown in element's ctor.  `bounded_queue` is basically a C++ implementation of [PTLQueue](https://blogs.oracle.com/dave/ptlqueue-:-a-scalable-bounded-capacity-mpmc-queue) design (Please read Dave Dice's article for details and references).
+
+## Features
+* interchangeable with std::async, accepts all kinds of callable instances, like static functions, member functions, functors, lambdas
+* dynamically changeable thread-pool size at run-time
+* tasks are managed in a lock-free queue
+* the provided lock-free queue doesn't have the restrictions of boost::lockfree::queue
+* low-latency for the task execution thanks to underlying lock-free queue
+
+## Tested Platforms& Compilers
+(old versions of OSs or compilers may work, but not tested)
+* Windows 10 Visual Studio 2015+
+* Linux Ubuntu 16.04 gcc4.9.2+/clang 3.8+
+* MacOS Sierra 10.12.5 clang-802.0.42
+
+## Getting Started
+## Building the test& benchmark
+
+### C++11 compilers
+If your compiler only supports C++11, please edit CMakeLists.txt with the following change:
+```
+set(CMAKE_CXX_STANDARD 14)
+#change to
+set(CMAKE_CXX_STANDARD 11)
+```
+
+### Build& test with Microsoft C++ REST SDK
+If your OS is Windows, or you have cpprestsdk installed & configured on Linux or Mac, please edit CMakeLists.txt to enable the PPL test:
+```
+option(WITH_CPPRESTSDK "Build Cpprestsdk Test" OFF)
+#to
+option(WITH_CPPRESTSDK "Build Cpprestsdk Test" ON)
+```
+
+
+### Build for Linux or Mac (x86-64 & ARMV7&V8)
+```
+#to use clang (linux) with following export command
+#EXPORT CC=clang-3.8
+#EXPORT CXX=clang++-3.8
+#run the following to set up a release build (for macOS Xcode, you can remove -DCMAKE_BUILD_TYPE for now, and choose the build type at build time)
+cmake -H. -Bbuild -DCMAKE_BUILD_TYPE=RELEASE
+#now build the release
+cmake --build build --config Release
+#or debug
+cmake --build build --config Debug
+#or other builds
+cmake --build build --config RelWithDebInfo
+cmake --build build --config MinSizeRel
+```
+
+### Build for Windows (X86-64)
+```
+#for VS 2015
+cmake -H. -Bbuild -G "Visual Studio 14 2015 Win64"
+#or VS 2017
+cmake -H. -Bbuild -G "Visual Studio 15 2017 Win64"
+#build the release from command line or you can open the project file in Visual Studio, and build from there
+cmake --build build --config Release
+```
+
+## How to use it in your project/application
+simply copy all headers in async sub-folder to your project, and include those headers in your source code.
+
+## Thread Pool Introduction
+### Thread Pool initializations
+
+```
+async::threadpool tp; //by default, thread pool size will be the same number of your hardware CPU core/threads
+async::threadpool tp(8); //create a thread pool with 8 threads
+async::threadpool tp(0); //create a thread pool with no threads available, it's in pause mode
+```
+
+### resize the thread pool
+```
+async::threadpool tp(32);
+...//some operations
+tp.configurepool(16);// can be called at anytime (as long as tp is still valid) to reset the pool size
+                     // no interruption for running tasks
+```
+### submit the task
+* static functions, member functions, functors, lambdas are all supported
+```
+int foo(int i) { return ++i; }
+auto pkg = tp.post(foo, i); //returns a std::future
+pkg.get(); //will block
+```
+
+## multi-producer multi-consumer unbounded lock-free queue Introduction
+The design: A simple and classic implementation. It's a link-based 3-level depth nested container with local array for each level storage and simulated tagged pointer for linking.
+The size of each level, and tag bits can be configured through TRAITS (please see source for details).
+The queue with default traits settings can store up to 1 Trillion elements/nodes (at least 1 Terabyte memory space).
+
+### element type requirements
+* nothrow destructible
+* optional (better to be true)
+  * nothrow constructible
+  * nothrow move-assignable
+
+NOTE: the exception thrown by constructor is acceptable. Although it'd be better to keep ctor noexcept if possible.
+noexcept detection is turned off by default, it can be turned on by setting  `TRAIT::NOEXCEPT_CHECK` to true.
+With `TRAIT::NOEXCEPT_CHECK` on(true), queue will enable exception handling if ctor or move assignment may throw exceptions.
+
+
+### queue initializations
+```
+async::queue<T> q; //default constructor, it's unbounded
+
+async::queue<T> q(1000); // pre-allocated 1000 storage nodes, the capacity will increase automatically after 1000 nodes are used
+```
+### usage
+```
+// enqueues a T constructed from args, supports the following constructions:
+// move, if args is a T rvalue
+// copy, if args is a T lvalue, or
+// emplacement if args is an initializer list that can be passed to a T constructor
+async::queue<T>::enqueue(Args... args)
+
+async::queue<T>::dequeue(T& data) //type T should have move assignment operator,
+//e.g.
+async::queue<int> q;
+q.enqueue(11);
+int i(0);
+q.dequeue(i);
+
+```
+### bulk operations
+It's convenient for bulk data, and can also boost the throughput.
+exception handling is not available in bulk operations even with `TRAIT::NOEXCEPT_CHECK` being true.
+bulk operations are suitable for plain data types, like network/event messages.
+
+```
+int a[] = {1,2,3,4,5};
+int b[5];
+q.bulk_enqueue(std::begin(a), 5);
+auto popcount = q.bulk_dequeue(std::begin(b), 5); //popcount is the number of elements successfully pulled from the queue.
+//or like the following code:
+std::vector<int> v;
+auto it = std::inserter(v, std::begin(v));
+popcount = q.bulk_dequeue(it, 5);
+```
+
+## Unit Test
+The unit test code provides most samples for usage.
+
+## Benchmark
+NOTE: the results may vary on different OS platforms and hardware.
+### thread pool benchmark
+The benchmark is a simple demonstration.
+NOTE: may require extra config, please see CMakeLists.txt for detailed settings
+The test benchmarks the following task/job based async implementations:
+* async::threadpool (this library)
+* std::async
+* boost::async
+* AsioThreadPool (my another implementation based on boost::asio, has very stable and good performance, especially on Windows with iocp)
+* Microsoft::PPL (pplx from [cpprestsdk](https://github.com/Microsoft/cpprestsdk) on Linux& MacOS or PPL on windows)
+
+
+e.g. Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64)
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 1130 ns  max: 1227 ns  min: 1066 ns avg_task_post: 1032 ns
+       *std::async (time/task) avg: 1469 ns  max: 1549 ns  min: 1423 ns avg_task_post: 1250 ns
+   *Microsoft::PPL (time/task) avg: 1148 ns  max: 1216 ns  min: 1114 ns avg_task_post: 1088 ns
+    AsioThreadPool (time/task) avg: 1166 ns  max: 1319 ns  min: 1013 ns avg_task_post: 1073 ns
+     *boost::async (time/task) avg: 29153 ns  max: 30028 ns  min: 27990 ns avg_task_post: 23343 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 439 ns  max: 557 ns  min: 398 ns avg_task_post: 356 ns
+       *std::async (time/task) avg: 800 ns  max: 890 ns  min: 759 ns avg_task_post: 629 ns
+   *Microsoft::PPL (time/task) avg: 666 ns  max: 701 ns  min: 640 ns avg_task_post: 605 ns
+    AsioThreadPool (time/task) avg: 448 ns  max: 541 ns  min: 389 ns avg_task_post: 365 ns
+     *boost::async (time/task) avg: 32419 ns  max: 33296 ns  min: 30105 ns avg_task_post: 25561 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 262 ns  max: 300 ns  min: 252 ns avg_task_post: 176 ns
+       *std::async (time/task) avg: 873 ns  max: 961 ns  min: 821 ns avg_task_post: 701 ns
+   *Microsoft::PPL (time/task) avg: 727 ns  max: 755 ns  min: 637 ns avg_task_post: 662 ns
+    AsioThreadPool (time/task) avg: 607 ns  max: 645 ns  min: 567 ns avg_task_post: 210 ns
+     *boost::async (time/task) avg: 33158 ns  max: 150331 ns  min: 28560 ns avg_task_post: 28655 ns
+```
+
+e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 1320 ns  max: 1357 ns  min: 1301 ns avg_task_post: 1266 ns
+       *std::async (time/task) avg: 11817 ns  max: 12469 ns  min: 11533 ns avg_task_post: 9580 ns
+   *Microsoft::PPL (time/task) avg: 1368 ns  max: 1498 ns  min: 1325 ns avg_task_post: 1349 ns
+    AsioThreadPool (time/task) avg: 1475 ns  max: 1499 ns  min: 1318 ns avg_task_post: 1332 ns
+     *boost::async (time/task) avg: 4574 ns  max: 4697 ns  min: 4450 ns avg_task_post: 4531 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 516 ns  max: 688 ns  min: 239 ns avg_task_post: 522 ns
+       *std::async (time/task) avg: 41630 ns  max: 44316 ns  min: 41334 ns avg_task_post: 38151 ns
+   *Microsoft::PPL (time/task) avg: 3652 ns  max: 3710 ns  min: 3598 ns avg_task_post: 3629 ns
+    AsioThreadPool (time/task) avg: 529 ns  max: 814 ns  min: 494 ns avg_task_post: 447 ns
+     *boost::async (time/task) avg: 14634 ns  max: 14669 ns  min: 14598 ns avg_task_post: 14583 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 398 ns  max: 468 ns  min: 337 ns avg_task_post: 177 ns
+       *std::async (time/task) avg: 44603 ns  max: 46904 ns  min: 44272 ns avg_task_post: 40877 ns
+   *Microsoft::PPL (time/task) avg: 3714 ns  max: 3816 ns  min: 3656 ns avg_task_post: 3690 ns
+    AsioThreadPool (time/task) avg: 564 ns  max: 605 ns  min: 533 ns avg_task_post: 253 ns
+     *boost::async (time/task) avg: 20421 ns  max: 21738 ns  min: 19105 ns avg_task_post: 20375 ns
+```
+
+e.g. MacOS 10.12.5 clang Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 (Microsoft::PPL(cpprestsdk::pplx) is surprisingly good compared with other libraries on MacOS, not sure if it's due to some compiler optimization)
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 8517 ns  max: 8641 ns  min: 7400 ns avg_task_post: 8393 ns
+       *std::async (time/task) avg: 13618 ns  max: 13845 ns  min: 13276 ns avg_task_post: 13476 ns
+   *Microsoft::PPL (time/task) avg: 747 ns  max: 938 ns  min: 626 ns avg_task_post: 718 ns
+    AsioThreadPool (time/task) avg: 8647 ns  max: 8807 ns  min: 8558 ns avg_task_post: 8524 ns
+     *boost::async (time/task) avg: 11732 ns  max: 12028 ns  min: 11526 ns avg_task_post: 11698 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 5964 ns  max: 6017 ns  min: 5790 ns avg_task_post: 5830 ns
+       *std::async (time/task) avg: 9690 ns  max: 10043 ns  min: 9132 ns avg_task_post: 9531 ns
+   *Microsoft::PPL (time/task) avg: 380 ns  max: 425 ns  min: 342 ns avg_task_post: 353 ns
+    AsioThreadPool (time/task) avg: 6173 ns  max: 6459 ns  min: 6116 ns avg_task_post: 6042 ns
+     *boost::async (time/task) avg: 8643 ns  max: 9470 ns  min: 8513 ns avg_task_post: 8591 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 3469 ns  max: 3527 ns  min: 3415 ns avg_task_post: 3339 ns
+       *std::async (time/task) avg: 10902 ns  max: 11164 ns  min: 10709 ns avg_task_post: 10738 ns
+   *Microsoft::PPL (time/task) avg: 367 ns  max: 426 ns  min: 326 ns avg_task_post: 323 ns
+    AsioThreadPool (time/task) avg: 3920 ns  max: 3975 ns  min: 3832 ns avg_task_post: 3409 ns
+     *boost::async (time/task) avg: 9800 ns  max: 10223 ns  min: 9196 ns avg_task_post: 9744 ns
+```
+
+e.g. Windows 7 64bit Intel i7-4790 16GB RAM Visual Studio 2015 Update 3
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 809 ns  max: 924 ns  min: 687 ns avg_task_post: 774 ns
+       *std::async (time/task) avg: 1914 ns  max: 2032 ns  min: 1790 ns avg_task_post: 1877 ns
+   *Microsoft::PPL (time/task) avg: 1718 ns  max: 2181 ns  min: 1623 ns avg_task_post: 1677 ns
+    AsioThreadPool (time/task) avg: 1100 ns  max: 1137 ns  min: 1076 ns avg_task_post: 1065 ns
+     *boost::async (time/task) avg: 191532 ns  max: 203716 ns  min: 186114 ns avg_task_post: 191507 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 423 ns  max: 538 ns  min: 338 ns avg_task_post: 388 ns
+       *std::async (time/task) avg: 1249 ns  max: 1279 ns  min: 1233 ns avg_task_post: 1211 ns
+   *Microsoft::PPL (time/task) avg: 1229 ns  max: 1246 ns  min: 1208 ns avg_task_post: 1186 ns
+    AsioThreadPool (time/task) avg: 563 ns  max: 577 ns  min: 499 ns avg_task_post: 528 ns
+     *boost::async (time/task) avg: 95484 ns  max: 112569 ns  min: 93808 ns avg_task_post: 95458 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 267 ns  max: 323 ns  min: 255 ns avg_task_post: 232 ns
+       *std::async (time/task) avg: 1202 ns  max: 1257 ns  min: 1182 ns avg_task_post: 1009 ns
+   *Microsoft::PPL (time/task) avg: 1199 ns  max: 1262 ns  min: 1175 ns avg_task_post: 988 ns
+    AsioThreadPool (time/task) avg: 783 ns  max: 960 ns  min: 706 ns avg_task_post: 375 ns
+     *boost::async (time/task) avg: 103572 ns  max: 107041 ns  min: 101993 ns avg_task_post: 103542 ns
+```
+
+e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+
+```
+Benchmark Test Run: 1 Producers 3(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 7809 ns  max: 10467 ns  min: 7453 ns avg_task_post: 7261 ns
+       *std::async (time/task) avg: 139664 ns  max: 3453077 ns  min: 104589 ns avg_task_post: 117819 ns
+    AsioThreadPool (time/task) avg: 6545 ns  max: 8804 ns  min: 5678 ns avg_task_post: 5654 ns
+     *boost::async (time/task) avg: 37629 ns  max: 38978 ns  min: 36769 ns avg_task_post: 36933 ns
+
+Benchmark Test Run: 2 Producers 2(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 2207 ns  max: 4084 ns  min: 1809 ns avg_task_post: 1325 ns
+       *std::async (time/task) avg: 431781 ns  max: 17500817 ns  min: 91919 ns avg_task_post: 407595 ns
+    AsioThreadPool (time/task) avg: 2251 ns  max: 3351 ns  min: 1839 ns avg_task_post: 1405 ns
+     *boost::async (time/task) avg: 48456 ns  max: 50578 ns  min: 46698 ns avg_task_post: 47753 ns
+
+Benchmark Test Run: 3 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 3346 ns  max: 3974 ns  min: 2635 ns avg_task_post: 1017 ns
+       *std::async (time/task) avg: 110853 ns  max: 768224 ns  min: 103045 ns avg_task_post: 86361 ns
+    AsioThreadPool (time/task) avg: 3828 ns  max: 4209 ns  min: 3354 ns avg_task_post: 976 ns
+     *boost::async (time/task) avg: 59094 ns  max: 67042 ns  min: 54802 ns avg_task_post: 58365 ns
+```
+
+### queue benchmark
+The benchmark uses a producer-consumer model and doesn't provide all the detailed measurements.
+* async::bounded_queue
+* async::queue
+* boost::lockfree::queue
+* boost::lockfree::spsc_queue  (only for single-producer-single-consumer test)
+
+e.g. Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64)
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 18 ns  max: 55 ns  min: 17 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 50 ns  min: 23 ns
+          async::queue (time/op) avg: 28 ns  max: 66 ns  min: 27 ns
+boost::lockfree::queue (time/op) avg: 167 ns  max: 195 ns  min: 70 ns
+boost::lockfree::spsc_queue (time/op) avg: 10 ns  max: 38 ns  min: 8 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 27 ns  max: 62 ns  min: 25 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 124 ns  min: 24 ns
+          async::queue (time/op) avg: 42 ns  max: 115 ns  min: 29 ns
+boost::lockfree::queue (time/op) avg: 240 ns  max: 576 ns  min: 119 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 44 ns  max: 78 ns  min: 29 ns
+async::queue::bulk(16) (time/op) avg: 34 ns  max: 109 ns  min: 28 ns
+          async::queue (time/op) avg: 90 ns  max: 122 ns  min: 44 ns
+boost::lockfree::queue (time/op) avg: 213 ns  max: 227 ns  min: 161 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 53 ns  max: 82 ns  min: 27 ns
+async::queue::bulk(16) (time/op) avg: 34 ns  max: 107 ns  min: 29 ns
+          async::queue (time/op) avg: 100 ns  max: 114 ns  min: 51 ns
+boost::lockfree::queue (time/op) avg: 197 ns  max: 207 ns  min: 186 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 31 ns  max: 81 ns  min: 25 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 104 ns  min: 28 ns
+          async::queue (time/op) avg: 93 ns  max: 117 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 211 ns  max: 222 ns  min: 162 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 52 ns  max: 79 ns  min: 30 ns
+async::queue::bulk(16) (time/op) avg: 33 ns  max: 103 ns  min: 29 ns
+          async::queue (time/op) avg: 94 ns  max: 126 ns  min: 74 ns
+boost::lockfree::queue (time/op) avg: 199 ns  max: 217 ns  min: 174 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 49 ns  max: 81 ns  min: 35 ns
+async::queue::bulk(16) (time/op) avg: 33 ns  max: 60 ns  min: 28 ns
+          async::queue (time/op) avg: 97 ns  max: 134 ns  min: 51 ns
+boost::lockfree::queue (time/op) avg: 185 ns  max: 198 ns  min: 152 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 36 ns  max: 81 ns  min: 34 ns
+async::queue::bulk(16) (time/op) avg: 30 ns  max: 60 ns  min: 26 ns
+          async::queue (time/op) avg: 48 ns  max: 89 ns  min: 45 ns
+boost::lockfree::queue (time/op) avg: 161 ns  max: 179 ns  min: 120 ns
+```
+
+e.g. MacOS 10.12.5 Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 12 ns  max: 37 ns  min: 12 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 54 ns  min: 25 ns
+          async::queue (time/op) avg: 23 ns  max: 61 ns  min: 23 ns
+boost::lockfree::queue (time/op) avg: 156 ns  max: 172 ns  min: 118 ns
+boost::lockfree::spsc_queue (time/op) avg: 11 ns  max: 30 ns  min: 5 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 84 ns  max: 98 ns  min: 60 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 125 ns  min: 24 ns
+          async::queue (time/op) avg: 104 ns  max: 115 ns  min: 92 ns
+boost::lockfree::queue (time/op) avg: 231 ns  max: 326 ns  min: 213 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 82 ns  max: 100 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 36 ns  max: 108 ns  min: 31 ns
+          async::queue (time/op) avg: 102 ns  max: 122 ns  min: 90 ns
+boost::lockfree::queue (time/op) avg: 192 ns  max: 229 ns  min: 184 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 79 ns  max: 93 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 94 ns  min: 29 ns
+          async::queue (time/op) avg: 98 ns  max: 116 ns  min: 70 ns
+boost::lockfree::queue (time/op) avg: 189 ns  max: 198 ns  min: 175 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 77 ns  max: 146 ns  min: 56 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 92 ns  min: 26 ns
+          async::queue (time/op) avg: 93 ns  max: 167 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 200 ns  max: 218 ns  min: 182 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 76 ns  max: 92 ns  min: 48 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 89 ns  min: 24 ns
+          async::queue (time/op) avg: 97 ns  max: 140 ns  min: 83 ns
+boost::lockfree::queue (time/op) avg: 200 ns  max: 211 ns  min: 163 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 80 ns  max: 98 ns  min: 59 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 97 ns  min: 24 ns
+          async::queue (time/op) avg: 105 ns  max: 122 ns  min: 78 ns
+boost::lockfree::queue (time/op) avg: 182 ns  max: 194 ns  min: 153 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 86 ns  max: 103 ns  min: 64 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 82 ns  min: 23 ns
+          async::queue (time/op) avg: 107 ns  max: 127 ns  min: 91 ns
+boost::lockfree::queue (time/op) avg: 154 ns  max: 180 ns  min: 146 ns
+```
+
+e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 12 ns  max: 71 ns  min: 11 ns
+async::queue::bulk(16) (time/op) avg: 65 ns  max: 134 ns  min: 24 ns
+          async::queue (time/op) avg: 48 ns  max: 107 ns  min: 33 ns
+boost::lockfree::queue (time/op) avg: 179 ns  max: 198 ns  min: 60 ns
+boost::lockfree::spsc_queue (time/op) avg: 7 ns  max: 47 ns  min: 4 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 68 ns  max: 505 ns  min: 35 ns
+async::queue::bulk(16) (time/op) avg: 29 ns  max: 135 ns  min: 25 ns
+          async::queue (time/op) avg: 93 ns  max: 138 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 234 ns  max: 292 ns  min: 208 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 68 ns  max: 106 ns  min: 39 ns
+async::queue::bulk(16) (time/op) avg: 35 ns  max: 117 ns  min: 19 ns
+          async::queue (time/op) avg: 92 ns  max: 135 ns  min: 79 ns
+boost::lockfree::queue (time/op) avg: 193 ns  max: 227 ns  min: 175 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 73 ns  max: 251 ns  min: 49 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 110 ns  min: 26 ns
+          async::queue (time/op) avg: 96 ns  max: 178 ns  min: 70 ns
+boost::lockfree::queue (time/op) avg: 179 ns  max: 359 ns  min: 164 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 81 ns  max: 220 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 114 ns  min: 23 ns
+          async::queue (time/op) avg: 102 ns  max: 159 ns  min: 74 ns
+boost::lockfree::queue (time/op) avg: 177 ns  max: 541 ns  min: 162 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 83 ns  max: 443 ns  min: 53 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 297 ns  min: 23 ns
+          async::queue (time/op) avg: 110 ns  max: 512 ns  min: 79 ns
+boost::lockfree::queue (time/op) avg: 176 ns  max: 505 ns  min: 161 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 83 ns  max: 437 ns  min: 36 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 261 ns  min: 23 ns
+          async::queue (time/op) avg: 112 ns  max: 449 ns  min: 84 ns
+boost::lockfree::queue (time/op) avg: 178 ns  max: 547 ns  min: 164 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 90 ns  max: 805 ns  min: 28 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 78 ns  min: 21 ns
+          async::queue (time/op) avg: 123 ns  max: 695 ns  min: 80 ns
+boost::lockfree::queue (time/op) avg: 195 ns  max: 615 ns  min: 154 ns
+```
+
+e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 67 ns  max: 697 ns  min: 53 ns
+async::queue::bulk(16) (time/op) avg: 144 ns  max: 434 ns  min: 130 ns
+          async::queue (time/op) avg: 141 ns  max: 441 ns  min: 115 ns
+boost::lockfree::queue (time/op) avg: 182 ns  max: 514 ns  min: 168 ns
+boost::lockfree::spsc_queue (time/op) avg: 62 ns  max: 430 ns  min: 53 ns
+
+Benchmark Test Run: 1 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 72 ns  max: 574 ns  min: 59 ns
+async::queue::bulk(16) (time/op) avg: 141 ns  max: 515 ns  min: 116 ns
+          async::queue (time/op) avg: 181 ns  max: 590 ns  min: 134 ns
+boost::lockfree::queue (time/op) avg: 192 ns  max: 1045 ns  min: 172 ns
+
+Benchmark Test Run: 2 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 82 ns  max: 457 ns  min: 65 ns
+async::queue::bulk(16) (time/op) avg: 99 ns  max: 701 ns  min: 84 ns
+          async::queue (time/op) avg: 124 ns  max: 550 ns  min: 108 ns
+boost::lockfree::queue (time/op) avg: 151 ns  max: 847 ns  min: 138 ns
+
+Benchmark Test Run: 3 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 88 ns  max: 538 ns  min: 67 ns
+async::queue::bulk(16) (time/op) avg: 89 ns  max: 717 ns  min: 71 ns
+          async::queue (time/op) avg: 131 ns  max: 631 ns  min: 118 ns
+boost::lockfree::queue (time/op) avg: 165 ns  max: 644 ns  min: 149 ns
+```
+
+e.g. Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) gcc 6.3.0 on Raspberry Pi 3 B+
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 227 ns  max: 912 ns  min: 179 ns
+async::queue::bulk(16) (time/op) avg: 442 ns  max: 1236 ns  min: 365 ns
+          async::queue (time/op) avg: 423 ns  max: 1249 ns  min: 364 ns
+boost::lockfree::queue (time/op) avg: 474 ns  max: 1017 ns  min: 410 ns
+boost::lockfree::spsc_queue (time/op) avg: 70 ns  max: 761 ns  min: 48 ns
+
+Benchmark Test Run: 1 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 241 ns  max: 1482 ns  min: 187 ns
+async::queue::bulk(16) (time/op) avg: 470 ns  max: 1259 ns  min: 354 ns
+          async::queue (time/op) avg: 488 ns  max: 1482 ns  min: 375 ns
+boost::lockfree::queue (time/op) avg: 462 ns  max: 1158 ns  min: 427 ns
+
+
+Benchmark Test Run: 2 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 208 ns  max: 348 ns  min: 158 ns
+async::queue::bulk(16) (time/op) avg: 285 ns  max: 543 ns  min: 237 ns
+          async::queue (time/op) avg: 306 ns  max: 761 ns  min: 234 ns
+boost::lockfree::queue (time/op) avg: 334 ns  max: 1481 ns  min: 261 ns
+
+
+Benchmark Test Run: 3 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 241 ns  max: 884 ns  min: 192 ns
+async::queue::bulk(16) (time/op) avg: 210 ns  max: 651 ns  min: 180 ns
+          async::queue (time/op) avg: 439 ns  max: 682 ns  min: 375 ns
+boost::lockfree::queue (time/op) avg: 420 ns  max: 903 ns  min: 320 ns
+```
+
+## coding style
+All code has been formatted by clang-format. It may be easier to read in a text editor, or maybe not :)
+
+## Many Thanks to 3rd party and their developers
+* [Boost](http://www.boost.org/)
+* [Boost CMake](https://github.com/Orphis/boost-cmake) Easy Boost integration in CMake projects!
+* [Catch](https://github.com/philsquared/Catch) A powerful test framework for unit test.
+* [cpprestsdk](https://github.com/Microsoft/cpprestsdk) The C++ REST SDK is a Microsoft project for cloud-based client-server communication in native code using a modern asynchronous C++ API design.
+* [rlutil](https://github.com/tapio/rlutil) provides cross-platform console-mode functions to position and colorize text.
+* [sakaki](https://github.com/sakaki-/gentoo-on-rpi3-64bit) Bootable 64-bit Gentoo image for the Raspberry Pi 3 B / B+, with Linux 4.14
diff --git a/src/3rdparty/async/bounded_queue.h b/src/3rdparty/async/bounded_queue.h
new file mode 100644
index 000000000..341e5f307
--- /dev/null
+++ b/src/3rdparty/async/bounded_queue.h
@@ -0,0 +1,342 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "utility.h"
+#include <atomic>
+#include <cassert>
+#include <limits>
+
+namespace async {
+
+struct bounded_traits { // default compile-time configuration (the TRAITS parameter) of bounded_queue
+  static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag: when true, bounded_queue picks its SAFE-IMPL overloads for operations that are not nothrow
+  static constexpr std::size_t CachelineSize = 64; // length in bytes of the char padding arrays bounded_queue places around its two indices
+  static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
+  using sequence_type = std::uint64_t; // unsigned integer type used for the enqueue/dequeue counters and the per-slot tickets
+};
+
+template <typename T, typename TRAITS = bounded_traits> class bounded_queue { // fixed-capacity ring buffer of T; slots are claimed through two atomic counters and handed over via a per-slot ticket
+private:
+  static_assert(std::is_nothrow_destructible<T>::value,
+                "T must be nothrow destructible");
+
+public:
+  static constexpr std::size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment;
+  using seq_t = typename TRAITS::sequence_type;
+  explicit bounded_queue(std::size_t size) // allocates `size` slots; size == 0 is rejected only by the assert in the body
+      : fastmodulo((size > 0 && ((size & (size - 1)) == 0))), // true when size is a power of two
+        bitshift(fastmodulo ? getShiftBitsCount(size) : 0), // getShiftBitsCount is not defined in this header; presumably log2(size) from utility.h — TODO confirm
+        elements(new element[size]), mask(fastmodulo ? size - 1 : 0),
+        qsize(size), enqueueIx(0), dequeueIx(0) {
+    assert(qsize > 0); // any size <= 0 is illegal
+  }
+  bounded_queue(bounded_queue const &) = delete; // the queue is neither copyable nor movable
+  bounded_queue(bounded_queue &&) = delete;
+  bounded_queue &operator=(bounded_queue const &) = delete;
+  bounded_queue &operator=(bounded_queue &&) = delete;
+  ~bounded_queue() { delete[] elements; } // runs ~element() for every slot, which destroys a value still stored there
+  std::size_t size() { return qsize; } // returns the capacity given to the constructor, not the number of queued items
+
+  template <typename... Args, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or T(Args...) is nothrow constructible
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline void blocking_enqueue(Args &&... args) noexcept {
+    auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); // unconditionally claim the next enqueue sequence number
+    auto &ele = elements[index(enqidx)];
+    auto enq_tkt = ticket(enqidx);
+    while (enq_tkt != ele.tkt.load(std::memory_order_acquire)) // busy-wait until the slot's ticket equals the even ticket expected for this sequence number
+      continue;
+    ele.construct(std::forward<Args>(args)...);
+    ele.tkt.store(enq_tkt + 1, std::memory_order_release); // odd ticket: the value is now visible to the matching dequeue
+  }
+
+  template <typename... Args, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and T(Args...) may throw
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline bool blocking_enqueue(Args &&... args) noexcept { // returns false when constructing T threw
+    auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel);
+    auto &ele = elements[index(enqidx)];
+    auto enq_tkt = ticket(enqidx);
+    while (enq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue;
+    if (ele.construct(std::forward<Args>(args)...)) {
+      ele.hasdata.store(true, std::memory_order_release);
+      ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+      return true;
+    } else {
+      ele.hasdata.store(false, std::memory_order_release); // mark the slot as holding no value ...
+      ele.tkt.store(enq_tkt + 1, std::memory_order_release); // ... but still advance the ticket so the matching dequeue is not left waiting
+      return false;
+    }
+  }
+
+  template <typename... Args, // NON-SAFE: non-blocking variant, returns false instead of waiting when no slot is free
+            typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                    std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline bool enqueue(Args &&... args) noexcept {
+    auto enqidx = enqueueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(enqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t enq_tkt = ticket(enqidx);
+      seq_t diff = tkt - enq_tkt; // unsigned subtraction: wraps to a very large value when the slot's ticket is behind enq_tkt
+      if (diff == 0) { // slot is free for this sequence number; try to claim it
+        if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1,
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed)) { // on failure enqidx is refreshed with the current counter and the loop retries
+          ele.construct(std::forward<Args>(args)...);
+          ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+          return true;
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is full
+      else
+        enqidx = enqueueIx.load(std::memory_order_acquire); // slot ticket is ahead of ours: the counter has moved on, reload it
+    }
+  }
+
+  template <typename... Args, // SAFE-IMPL: as above, but false can also mean that constructing T threw
+            typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                    !std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline bool enqueue(Args &&... args) noexcept {
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    for (;;) {
+      auto &ele = elements[index(enqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t enq_tkt = ticket(enqidx);
+      seq_t diff = tkt - enq_tkt;
+      if (diff == 0) {
+        if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1,
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed)) {
+          if (ele.construct(std::forward<Args>(args)...)) {
+            ele.hasdata.store(true, std::memory_order_release);
+            ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+            return true;
+          } else {
+            ele.hasdata.store(false, std::memory_order_release); // the slot stays claimed but is flagged as empty for the dequeue side
+            ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+            return false;
+          }
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is full
+      else
+        enqidx = enqueueIx.load(std::memory_order_acquire);
+    }
+  }
+
+  template <typename U = T, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or U is nothrow default-constructible
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<U>::value>::type>
+  inline void blocking_dequeue(U &data) noexcept {
+    auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); // unconditionally claim the next dequeue sequence number
+    auto &ele = elements[index(deqidx)];
+    seq_t deq_tkt = ticket(deqidx) + 1; // the odd ticket an enqueue stores after filling this slot
+    while (deq_tkt != ele.tkt.load(std::memory_order_acquire)) // busy-wait until the enqueue has published the slot
+      continue;
+    ele.move(data); // move-assigns the stored value into data and destroys the stored object
+    ele.tkt.store(deq_tkt + 1, std::memory_order_release); // even ticket again: the slot is released for a later enqueue
+  }
+
+  template <typename U = T, // SAFE-IMPL: returns false when the slot was flagged as holding no value
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<U>::value>::type>
+  inline bool blocking_dequeue(U &data) noexcept {
+    auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel);
+    auto &ele = elements[index(deqidx)];
+    seq_t deq_tkt = ticket(deqidx) + 1;
+    while (deq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue;
+    if (ele.hasdata.load(std::memory_order_acquire)) {
+      ele.move(data);
+      ele.tkt.store(deq_tkt + 1, std::memory_order_release);
+      return true;
+    } else {
+      ele.tkt.store(deq_tkt + 1, std::memory_order_release); // nothing to move out; only release the slot
+      return false;
+    }
+  }
+
+  template <typename U = T, // NON-SAFE: non-blocking variant, returns false instead of waiting when no value is ready
+            typename std::enable_if<!TRAITS::NOEXCEPT_CHECK ||
+                                        std::is_nothrow_constructible<U>::value,
+                                    int>::type = 0>
+  inline bool dequeue(U &data) {
+
+    auto deqidx = dequeueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(deqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t deq_tkt = ticket(deqidx) + 1;
+      seq_t diff = tkt - deq_tkt; // unsigned subtraction: wraps to a very large value when the slot has not been filled yet
+      if (diff == 0) { // slot holds a published value for this sequence number; try to claim it
+        if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1,
+                                              std::memory_order_acq_rel,
+                                              std::memory_order_relaxed)) { // on failure deqidx is refreshed with the current counter and the loop retries
+          ele.move(data);
+          ele.tkt.store(deq_tkt + 1, std::memory_order_release);
+          return true;
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is empty
+      else {
+
+        deqidx = dequeueIx.load(std::memory_order_acquire); // slot ticket is ahead of ours: the counter has moved on, reload it
+      }
+    }
+  }
+
+  template <
+      typename U = T, // SAFE-IMPL
+      typename std::enable_if<TRAITS::NOEXCEPT_CHECK &&
+                                  !std::is_nothrow_constructible<U>::value,
+                              int>::type = 0>
+  inline bool
+  dequeue(U &data) // false could be queue is empty, or skip an invalid element
+  {
+
+    auto deqidx = dequeueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(deqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t deq_tkt = ticket(deqidx) + 1;
+      seq_t diff = tkt - deq_tkt;
+      if (diff == 0) {
+        if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1,
+                                              std::memory_order_acq_rel,
+                                              std::memory_order_relaxed)) {
+          if (ele.hasdata.load(std::memory_order_acquire)) {
+            ele.move(data);
+            ele.tkt.store(deq_tkt + 1, std::memory_order_release);
+            return true;
+          } else {
+            ele.tkt.store(deq_tkt + 1, std::memory_order_release); // slot was flagged empty by a failed enqueue: consume it without touching data
+            return false;
+          }
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is empty
+      else {
+        deqidx = dequeueIx.load(std::memory_order_acquire);
+      }
+    }
+  }
+
+private:
+  inline seq_t index(seq_t const seq) { // maps a sequence number to a slot index in [0, qsize)
+    if (fastmodulo)
+      return seq & mask; // mask == qsize - 1 when qsize is a power of two
+    else
+      return seq >= qsize ? seq % qsize : seq;
+  }
+
+  inline seq_t ticket(seq_t const seq) { // even ticket an enqueue with this sequence number expects to find in its slot
+    if (fastmodulo)
+      return (seq >> bitshift) << 1;
+    else
+      return (seq / static_cast<seq_t>(qsize)) << 1; // twice the number of whole passes over the buffer
+  }
+  //TODO& Review: replace the following with c++ concepts
+  template <typename U = T, typename Enable = void> struct checkdata {}; // optional base of element: empty unless the SAFE-IMPL condition below holds
+
+  template <typename U>
+  struct checkdata<U, typename std::enable_if<
+                          !TRAITS::NOEXCEPT_CHECK ||
+                          std::is_nothrow_constructible<U>::value>::type> {};
+
+  template <typename U>
+  struct checkdata<U, typename std::enable_if<
+                          TRAITS::NOEXCEPT_CHECK &&
+                          !std::is_nothrow_constructible<U>::value>::type> {
+    checkdata() : hasdata(false) {}
+    std::atomic<bool> hasdata; // NOTE(review): element declares its own hasdata member, which hides this one
+  };
+
+  struct element : public checkdata<T> { // one slot: ticket plus raw storage for a T
+    element() : tkt(0) {} // ticket 0 matches ticket() of the first pass, so every slot starts out free
+    ~element() {
+      if (tkt & 1) // enqueue op visited
+        destruct(); // NOTE(review): the SAFE-IMPL enqueue also leaves an odd ticket after a failed construct (hasdata == false); confirm destruct() is valid in that case
+    }
+
+    template <typename... Args, // NON-SAFE
+              typename = typename std::enable_if<
+                  !TRAITS::NOEXCEPT_CHECK ||
+                  std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline void construct(Args &&... args) noexcept {
+      new (&storage) T(std::forward<Args>(args)...); // placement-new into the slot's raw storage
+    }
+
+    template <typename... Args, // SAFE-IMPL
+              typename = typename std::enable_if<
+                  TRAITS::NOEXCEPT_CHECK &&
+                  !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline bool construct(Args &&... args) noexcept { // returns false instead of propagating an exception from T's constructor
+      try {
+        new (&storage) T(std::forward<Args>(args)...);
+      } catch (...) {
+        return false;
+      }
+      return true;
+    }
+
+    inline void destruct() noexcept { reinterpret_cast<T *>(&storage)->~T(); } // runs T's destructor in place; the storage itself is kept
+
+    inline T *getptr() { return reinterpret_cast<T *>(&storage); }
+
+    template <
+        typename U = T, // NON-SAFE
+        typename std::enable_if<!TRAITS::NOEXCEPT_CHECK ||
+                                    std::is_nothrow_move_assignable<U>::value,
+                                int>::type = 0>
+    inline void move(U &data) { // move-assigns the stored value into data, then destroys the stored object
+      data = std::move(*getptr());
+      destruct();
+    }
+
+    template <
+        typename U = T, // SAFE-IMPL
+        typename std::enable_if<TRAITS::NOEXCEPT_CHECK &&
+                                    !std::is_nothrow_move_assignable<U>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      try {
+        data = std::move(*getptr());
+      } catch (...) { // an exception from the move assignment is swallowed here
+      }
+      destruct(); // the stored object is destroyed whether or not the assignment succeeded
+    }
+
+    std::atomic<seq_t> tkt; // slot ticket: enqueue waits for an even value and stores the next odd one; dequeue waits for that odd value and stores the next even one
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type storage; // raw, suitably aligned bytes for one T
+    std::atomic<bool> hasdata; // written and read only by the SAFE-IMPL overloads; not set by the element() constructor
+  };
+
+  bool const fastmodulo;   // true if qsize is power of 2
+  int const bitshift;      // used if fastmodulo is true
+  element *const elements; // pointer to buffer
+  std::size_t const mask;       // used if fastmodulo is true
+  std::size_t const qsize;      // queue size
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size]; // unused bytes separating the constants above from enqueueIx
+  alignas(cacheline_alignment) std::atomic<seq_t> enqueueIx; // next sequence number to hand to an enqueue
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size]; // unused bytes separating enqueueIx from dequeueIx
+  alignas(cacheline_alignment) std::atomic<seq_t> dequeueIx; // next sequence number to hand to a dequeue
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size]; // unused bytes after dequeueIx
+};
+} // namespace async
diff --git a/src/3rdparty/async/queue.h b/src/3rdparty/async/queue.h
new file mode 100644
index 000000000..6b00d1d61
--- /dev/null
+++ b/src/3rdparty/async/queue.h
@@ -0,0 +1,429 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+#include "utility.h"
+#include <array>
+#include <atomic>
+#include <memory>
+
+namespace async {
+struct traits // 3-level (L3, L2, L1) depth of nested group design, total
+              // indexing space is pow(2, 64-Tagbits)
+{             // users can change the bit settings by providing their own TRAITS
+  static constexpr std::uint64_t Tagbits = 24; // topmost bits of an index: version tag
+  static constexpr std::uint64_t L3bits = 10; // bits selecting the L3 (outermost) subgroup
+  static constexpr std::uint64_t L2bits = 10; // bits selecting the L2 subgroup
+  static constexpr std::uint64_t L1bits = 12; // bits selecting the L1 subgroup
+  static constexpr std::uint64_t Basebits = 8; // lowest bits: node slot inside a base container
+  static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag: true enables the SAFE-IMPL overloads
+  static constexpr std::size_t CachelineSize = 64; // bytes of padding placed between the queue's atomic indices
+  static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
+};
+
+template <typename T, typename TRAITS = traits> class queue final {
+public:
+  static bool is_lock_free_v() {
+    return std::atomic<std::uint64_t>{}.is_lock_free();
+  }
+  static constexpr std::size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment;
+  static constexpr std::uint64_t BaseMask = getBitmask<std::uint64_t>(TRAITS::Basebits);
+  static constexpr std::uint64_t L1Mask = getBitmask<std::uint64_t>(TRAITS::L1bits)
+                                     << TRAITS::Basebits;
+  static constexpr std::uint64_t L2Mask = getBitmask<std::uint64_t>(TRAITS::L2bits)
+                                     << (TRAITS::Basebits + TRAITS::L1bits);
+  static constexpr std::uint64_t L3Mask =
+      getBitmask<std::uint64_t>(TRAITS::L3bits)
+      << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits);
+  static constexpr std::uint64_t TagMask =
+      getBitmask<std::uint64_t>(TRAITS::Tagbits)
+      << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits + TRAITS::L3bits);
+  static constexpr std::uint64_t TagShift = 64 - TRAITS::Tagbits;
+  static constexpr std::uint64_t TagPlus1 = static_cast<std::uint64_t>(1) << TagShift;
+
+public: // assert bits settings meet requirements
+  static_assert(TRAITS::Tagbits + TRAITS::L3bits + TRAITS::L2bits +
+                        TRAITS::L1bits + TRAITS::Basebits ==
+                    64,
+                "The sum of all bits settings should be 64");
+  static_assert(TRAITS::Tagbits > 0 && TRAITS::L3bits > 0 &&
+                    TRAITS::L2bits > 0 && TRAITS::L1bits > 0 &&
+                    TRAITS::Basebits > 3,
+                "All bits settings should be > 0 and Basebits must be > 3");
+  static_assert(std::is_nothrow_destructible<T>::value,
+                "T must be nothrow destructible");
+
+public:
+  queue() : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) {
+    container.get(index(0)); // allocate initial space
+  }
+  queue(std::size_t size) // pre-allocate size
+      : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) {
+    container.get(index(0)); // allocate initial space
+
+    // every node beyond the first 2^Basebits is created now and handed to
+    // recycle(); the loop body never runs when size does not exceed 2^Basebits
+    index ix;
+    for (std::size_t i = (static_cast<std::uint64_t>(1) << TRAITS::Basebits);
+         i < size; ++i) {
+      getNode(ix); // only the index written into ix is needed, not the returned node
+      recycle(ix);
+    }
+  }
+
+  queue(queue const &other) = delete;
+  queue &operator=(queue const &other) = delete;
+  queue(queue &&other) = delete;
+  queue &operator=(queue &&other) = delete;
+
+  template <typename... Args, // NON-SAFE
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline void enqueue(Args &&... args) noexcept {
+    auto ix = encapsulate(std::forward<Args>(args)...);
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, ix, std::memory_order_release, std::memory_order_relaxed))
+      continue;
+    container[enqidx].next.store(ix, std::memory_order_release);
+  }
+
+  template <typename... Args, // SAFE-IMPL
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline bool enqueue(Args &&... args) noexcept {
+    auto ix = encapsulate(std::forward<Args>(args)...);
+    if (ix == 0)
+      return false;
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, ix, std::memory_order_release, std::memory_order_relaxed))
+      continue;
+    container[enqidx].next.store(ix, std::memory_order_release);
+    return true;
+  }
+
+  template <typename IT> void bulk_enqueue(IT it, std::size_t count) {
+    if (count == 0) return; // empty batch: never publish index 0 as the new tail
+    index firstidx(0), preidx(0), lastidx(0);
+    for (std::size_t i = 0; i < count; ++i) {
+      lastidx = encapsulate(*it++); // wrap the next input element in a node
+      if (firstidx == 0)
+        firstidx = lastidx;
+      if (preidx != 0) // chain the batch together before it is published
+        container[preidx].next.store(lastidx, std::memory_order_relaxed);
+      preidx = lastidx;
+    }
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, lastidx, std::memory_order_release, std::memory_order_relaxed))
+      continue;
+    container[enqidx].next.store(firstidx, std::memory_order_release);
+  }
+
+  template <typename IT>
+  std::size_t bulk_dequeue(IT &&it, std::size_t maxcount) // dequeue up to maxcount elements into *it
+  {
+    std::size_t count(0);
+    while (maxcount-- && dequeue(*it++)) { // stops at maxcount or when dequeue() returns false
+      ++count;
+    }
+    return count; // number of successful dequeue() calls
+  }
+
+  template <typename U> // U could be T, or any kind of iterator/adapter,
+                        // like insert_iterator
+  inline bool dequeue(U &data) noexcept // returns false if the queue is empty
+  {
+    for (;;) {
+      auto deqidx = dequeueIx.load(std::memory_order_acquire);
+      auto &node = container[deqidx];
+      auto next = node.next.load(std::memory_order_relaxed);
+      if (next == 0) { // no successor is linked to this node (yet)
+        auto ready_for_consume =
+            node.consume_ready.load(std::memory_order_relaxed);
+        if (!ready_for_consume) {
+          return false; // nothing to consume in the last node
+        }
+
+        if (node.consume_ready.compare_exchange_strong(
+                ready_for_consume, false, std::memory_order_release,
+                std::memory_order_relaxed)) { // this thread claimed the payload
+          node.template move<TRAITS>(data);
+          return true;
+        }
+      } else {
+        if (dequeueIx.compare_exchange_weak(deqidx, next,
+                                            std::memory_order_acq_rel,
+                                            std::memory_order_relaxed)) { // advanced dequeueIx past this node
+          auto ready_for_consume =
+              node.consume_ready.load(std::memory_order_acquire);
+          if (ready_for_consume &&
+              node.consume_ready.compare_exchange_strong(
+                  ready_for_consume, false, std::memory_order_release,
+                  std::memory_order_relaxed)) {
+            node.template move<TRAITS>(data);
+          } else { // the node is being consumed by another thread; wait
+                   // until it has finished
+            for (; !node.recycle_ready.load(std::memory_order_acquire);) {
+            }
+          }
+          node.next.store(
+              0, std::memory_order_relaxed); // reset link to avoid chain effect
+          recycle(deqidx);
+          if (ready_for_consume) // true only if this thread moved the payload out
+            return ready_for_consume;
+        }
+      }
+    }
+  }
+  std::uint64_t getNodeCount() { return nodeCount; } // get in-use-nodes count
+
+private:       // internal data structures
+  struct index // simulate tagged pointer
+  {
+    index(std::uint64_t newval) noexcept
+        : value(newval) {} // is_trivially_copyable must be true
+    index() noexcept : value(0) {}
+    inline operator std::uint64_t() const { return value; }
+    std::uint64_t getVersion() { return (value & TagMask) >> TagShift; }
+    inline void increTag() {
+      value = (value & ~TagMask) | ((value + TagPlus1) & TagMask);
+    }
+    std::uint64_t value;
+  };
+
+  struct node // to store the data
+  {
+    node() : next(0), consume_ready(false), recycle_ready(true) {}
+    ~node() noexcept {
+      if (consume_ready.load(std::memory_order_relaxed)) {
+        destruct();
+      }
+    }
+
+    template <typename... Args, // NON-SAFE
+              typename = typename std::enable_if<
+                  !TRAITS::NOEXCEPT_CHECK ||
+                  std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline void construct(Args &&... args) noexcept {
+      new (&storage) T(std::forward<Args>(args)...);
+      consume_ready.store(true, std::memory_order_release);
+      recycle_ready.store(false, std::memory_order_release);
+    }
+
+    template <typename... Args, // SAFE-IMPL
+              typename = typename std::enable_if<
+                  TRAITS::NOEXCEPT_CHECK &&
+                  !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline bool construct(Args &&... args) noexcept {
+      try {
+        new (&storage) T(std::forward<Args>(args)...);
+      } catch (...) {
+        return false;
+      }
+
+      consume_ready.store(true, std::memory_order_release);
+      recycle_ready.store(false, std::memory_order_release);
+      return true;
+    }
+
+    inline void destruct() noexcept { reinterpret_cast<T *>(&storage)->~T(); }
+
+    template <
+        typename TR, typename U, // NON-SAFE
+        typename std::enable_if<!TR::NOEXCEPT_CHECK ||
+                                    std::is_nothrow_move_assignable<T>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      data = std::move(*getptr());
+      destruct();
+      recycle_ready.store(true, std::memory_order_release);
+    }
+
+    template <
+        typename TR, typename U, // SAFE-IMPL
+        typename std::enable_if<TR::NOEXCEPT_CHECK &&
+                                    !std::is_nothrow_move_assignable<T>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      try {
+        data = std::move(*getptr());
+      } catch (...) {
+      }
+      destruct();
+      recycle_ready.store(true, std::memory_order_release);
+    }
+    inline T *getptr() { return reinterpret_cast<T *>(&storage); }
+    std::atomic<index> next;         // link
+    std::atomic<bool> consume_ready; // if true, consume ready
+    std::atomic<bool> recycle_ready; // if true, recycle ready
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type storage; // data
+  };
+
+  struct basecontainer {
+    inline node &get(index const &ix) { return operator[](ix); }
+    inline node &at(index const &ix) { return operator[](ix); }
+    inline node &operator[](index const &ix) { return nodes[ix & BaseMask]; }
+    std::array<node, static_cast<std::uint64_t>(1) << TRAITS::Basebits> nodes;
+  };
+
+  template <typename SubGroup, std::uint64_t BitMask> struct nestedcontainer {
+    static constexpr std::uint64_t mask = BitMask;
+    static constexpr std::uint64_t bits = getSetBitsCount(mask);
+    static constexpr std::uint64_t shift = getShiftBitsCount(mask);
+    std::array<std::atomic<SubGroup *>, static_cast<std::uint64_t>(1) << bits>
+        subgroups;
+    nestedcontainer() {
+      for (auto &gptr : subgroups) {
+        gptr.store(nullptr, std::memory_order_release);
+      }
+    }
+    ~nestedcontainer() {
+      for (auto &gptr : subgroups) {
+        if (gptr.load(std::memory_order_relaxed) != nullptr)
+          delete gptr.load(std::memory_order_relaxed);
+      }
+    }
+
+    inline node &get(index const &ix) // will trigger the new operation if
+                                      // subgroup doesn't exist
+    {
+      auto ptr =
+          subgroups[(ix & mask) >> shift].load(std::memory_order_acquire);
+      if (ptr == nullptr) {
+        auto newgroup = std::make_unique<SubGroup>(); // if ComExch fails,
+                                                      // unique_ptr will self
+                                                      // delete
+        if (subgroups[(ix & mask) >> shift].compare_exchange_strong(
+                ptr, newgroup.get(), std::memory_order_release,
+                std::memory_order_acquire)) {
+          ptr = newgroup.release();
+        }
+      }
+      return ptr->get(ix); // recursively calling get 'til get the node
+    }
+
+    inline node &operator[](index const &ix) {
+      return subgroups[(ix & mask) >> shift]
+          .load(std::memory_order_relaxed)
+          ->
+          operator[](ix);
+    }
+
+    inline node &at(index const &ix) { // balanced performance and safety
+      auto ptr =
+          subgroups[(ix & mask) >> shift].load(std::memory_order_relaxed);
+      if (ptr)
+        return ptr->at(ix);
+      else
+        return get(ix);
+    }
+  };
+
+  inline node &getNode(index &ix) { // return an existing or new node
+    #if defined(__arm__) && (!defined(__aarch64__))
+    //for ARMV7 or below
+    ix.value = nodeCount.load(std::memory_order_relaxed);
+    auto val = ix.value + 1;
+    while(!nodeCount.compare_exchange_weak(
+      ix.value, val, std::memory_order_release, std::memory_order_relaxed)) {
+        val = ix.value + 1;
+    }
+    #else
+    ix.value = nodeCount.fetch_add(static_cast<std::uint64_t>(1),
+                              std::memory_order_relaxed);
+    #endif
+    if ((ix.value & BaseMask) == 0)
+      return container.get(ix);
+    else
+      return container.at(ix);
+  }
+
+  template <typename... Args, // NON-SAFE
+            typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                    std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline index encapsulate(Args &&... args) noexcept {
+    auto ix = spawn();
+    auto &node = container[ix];
+    node.construct(std::forward<Args>(args)...);
+    node.next.store(0, std::memory_order_relaxed);
+    return ix;
+  }
+
+  template <typename... Args, // SAFE-IMPL
+            typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                    !std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline index encapsulate(Args &&... args) noexcept {
+    auto ix = spawn();
+    auto &node = container[ix];
+    node.next.store(0, std::memory_order_relaxed);
+    if (node.construct(std::forward<Args>(args)...))
+      return ix;
+    else {
+      recycle(ix); // construction failed, recycle the node
+      return index(0);
+    }
+  }
+
+  inline void recycle(index const &ix) {
+    index prev = recycleIx.load(std::memory_order_relaxed);
+    while (!recycleIx.compare_exchange_weak(prev, ix, std::memory_order_release,
+                                            std::memory_order_relaxed)) {
+    } // a failed CAS reloads prev with the current value, so simply retry
+    container[prev].next.store(ix, std::memory_order_release); // link the replaced entry to ix
+  }
+
+  inline auto spawn()
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER < 1800)
+      -> index
+#endif
+  {
+    index ix(0);
+    for (;;) {
+      auto spaidx = spawnIx.load(std::memory_order_acquire);
+      auto next = container[spaidx].next.load(std::memory_order_relaxed);
+      if (next == 0) {
+        getNode(ix);
+        return ix;
+      } else {
+        if (spawnIx.compare_exchange_weak(spaidx, next,
+                                          std::memory_order_acq_rel,
+                                          std::memory_order_relaxed)) {
+          if (spaidx != 0) {
+            spaidx.increTag();
+          }
+          return spaidx;
+        }
+      }
+    }
+  }
+ 
+  using L1container = nestedcontainer<basecontainer, L1Mask>;
+  using L2container = nestedcontainer<L1container, L2Mask>;
+  nestedcontainer<L2container, L3Mask> container;
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<std::uint64_t> nodeCount; // # of allocated nodes, not the #
+                                                                // of elements stored in the queue
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> dequeueIx;    // dequeue pointer
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> enqueueIx;    // enqueue pointer
+  alignas(cacheline_alignment) char cacheline_padding4[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> spawnIx;      // spawn pointer
+  alignas(cacheline_alignment) char cacheline_padding5[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> recycleIx;    // recycle pointer
+  alignas(cacheline_alignment) char cacheline_padding6[cacheline_size];
+};
+} // namespace async
diff --git a/src/3rdparty/async/threadpool.h b/src/3rdparty/async/threadpool.h
new file mode 100644
index 000000000..395a9d850
--- /dev/null
+++ b/src/3rdparty/async/threadpool.h
@@ -0,0 +1,192 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+#include "queue.h"
+#include <atomic>
+#include <functional>
+#include <future>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+namespace async {
+// thread pool to execute functions, functors, lambdas asynchronously,
+// default poolsize = machine's logical CPU cores/threads
+class threadpool final {
+public:
+  static int defaultpoolsize() { const unsigned hc = std::thread::hardware_concurrency(); return hc ? static_cast<int>(hc) : 1; } // hc == 0 means "unknown": use one worker
+
+  threadpool(int poolsize = defaultpoolsize())
+      : idlecount(0), conflag(false) {
+    configurepool(poolsize);
+  }
+
+  threadpool(const threadpool &) = delete;
+  threadpool(threadpool &&) = delete;
+  threadpool &operator=(const threadpool &) = delete;
+  threadpool &operator=(threadpool &&) = delete;
+
+  ~threadpool() { cleanup(); }
+
+  inline std::size_t size() {
+    std::lock_guard<std::mutex> lg(poolmux);
+    return threads.size();
+  }
+
+  inline int idlesize() { return idlecount; }
+
+  // can be called to resize the pool at any time after construction and before
+  // destruction; it is recommended to call it from the main thread or a manager
+  // thread even though it is thread-safe
+  void configurepool(std::size_t poolsize) {
+    std::unique_lock<std::mutex> veclk(poolmux); // guards threads and tpstops
+    auto currentsize = threads.size();
+    if (currentsize < poolsize) { // expand the pool
+      for (std::size_t i = currentsize; i < poolsize; i++) {
+        tpstops.emplace_back(addthread()); // addthread() appends to threads and returns the stop flag
+      }
+    } else if (currentsize > poolsize) { // shrink the pool
+      std::vector<std::unique_ptr<std::thread>> dumpthreads;
+      std::vector<std::atomic<bool> *> dumpthreadstops;
+      std::move(threads.begin() + poolsize, threads.end(),
+                std::back_inserter(dumpthreads));
+      std::move(tpstops.begin() + poolsize, tpstops.end(),
+                std::back_inserter(dumpthreadstops));
+      tpstops.resize(poolsize);
+      threads.resize(poolsize);
+      veclk.unlock(); // the surplus entries are now only reachable through the local vectors
+      for (auto &a : dumpthreadstops) {
+        *a = true; // signal the surplus workers to stop
+      }
+      for (auto &t : dumpthreads) {
+        t->detach(); // NOTE(review): detached workers still hold executor::thpool - presumably the pool must outlive them; confirm
+      }
+      {
+        std::unique_lock<std::mutex> lk(qcvmux); // wake suspended threads so they can quit
+        qcv.notify_all();
+      }
+    }
+  }
+
+  template <typename Func, typename... Args>
+  inline auto post(Func &&func, Args &&... args)
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER <= 1800)
+      -> std::future<typename std::result_of<Func(Args...)>::type>
+#endif
+  { // TODO: replace result_of with invoke_result_t when migrate to c++17
+    auto taskptr = std::make_shared<
+        std::packaged_task<typename std::result_of<Func(Args...)>::type()>>(
+        std::bind(std::forward<Func>(func), std::forward<Args>(args)...));
+    taskqueue.enqueue([taskptr]() { (*taskptr)(); });
+    {
+      std::lock_guard<std::mutex> lg(qcvmux);
+      conflag = true;
+    }
+    qcv.notify_one();
+    return taskptr->get_future();
+  }
+
+  template <typename Func>
+  inline auto post(Func &&func)
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER <= 1800)
+      -> std::future<typename std::result_of<Func()>::type>
+#endif
+  { // a special case for func() type without any parameters, might be
+    // removed later
+    auto taskptr = std::make_shared<
+        std::packaged_task<typename std::result_of<Func()>::type()>>(
+        std::forward<Func>(func));
+    taskqueue.enqueue([taskptr]() { (*taskptr)(); });
+    {
+      std::lock_guard<std::mutex> lg(qcvmux);
+      conflag = true;
+    }
+    qcv.notify_one();
+    return taskptr->get_future();
+  }
+
+private:
+  struct executor {
+    executor(std::unique_ptr<std::atomic<bool>> &&ptr, threadpool &pool)
+        : stop(std::move(ptr)), thpool(pool) {}
+    void operator()() {
+      while (!*stop) {
+        if (!thpool.executetask_in_loop(*stop)) {
+          return; // signaled to quit
+        }
+        thpool.wait_for_task(*stop); // wait for new task
+      }
+    }
+
+  private:
+    std::unique_ptr<std::atomic<bool>> stop;
+    threadpool &thpool;
+  };
+
+  std::atomic<bool> *addthread() {
+    auto stopuniptr = std::make_unique<std::atomic<bool>>(false);
+    auto stoprawptr = stopuniptr.get();
+    threads.emplace_back(
+        std::make_unique<std::thread>(executor(std::move(stopuniptr), *this)));
+    return stoprawptr;
+  }
+
+  void cleanup() { // make sure no more tasks being pushed to the taskqueue
+    {
+      std::lock_guard<std::mutex> lk(qcvmux);
+      qcv.notify_all(); // let running thread drain the task queue? no need,
+                        // should be removed
+    }
+    for (auto &stop : tpstops) {
+      *stop = true; // stop signaled
+    }
+    {
+      std::lock_guard<std::mutex> lk(qcvmux);
+      qcv.notify_all(); // notify again
+    }
+    for (auto &thread : threads) {
+      if (thread->joinable())
+        thread->join();
+    }
+    threads.clear();
+    tpstops.clear();
+  }
+
+  inline void wait_for_task(std::atomic<bool> const &stop) {
+    idlecount.fetch_add(1, std::memory_order_relaxed); // this worker counts as idle while it waits
+    {
+      std::unique_lock<std::mutex> lk(qcvmux);
+      qcv.wait(lk, [&]() { // returns once conflag is set or stop is signaled
+        return conflag || stop.load(std::memory_order_acquire);
+      }); // memory_order can be removed
+      conflag = false; // reset the continue flag while still holding qcvmux
+    }
+    idlecount.fetch_sub(1, std::memory_order_relaxed);
+  }
+
+  inline bool executetask_in_loop(std::atomic<bool> const &stop) {
+    std::function<void()> func;
+    for (; taskqueue.dequeue(func);) {
+      func();
+      if (stop) // stop is signaled
+        return false;
+    }
+    return true;
+  }
+
+  std::vector<std::unique_ptr<std::thread>> threads;
+  std::vector<std::atomic<bool> *> tpstops; // threads terminate flags
+  async::queue<std::function<void()>> taskqueue;
+  std::atomic<int> idlecount; // idle thread count
+  std::mutex qcvmux, poolmux;
+  std::condition_variable qcv;
+  bool conflag; // continue flag for cv
+};
+} // namespace async
diff --git a/src/3rdparty/async/utility.h b/src/3rdparty/async/utility.h
new file mode 100644
index 000000000..f5bb2d1f4
--- /dev/null
+++ b/src/3rdparty/async/utility.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus < 201103L) ||    \
+    (defined(_MSC_VER) && _MSC_VER < 1800)
+#error This library needs at least a C++11 compliant compiler
+#endif
+#include <climits>
+#include <cstdint>
+#include <memory>
+#include <string>
+template <typename T> static constexpr T getBitmask(unsigned int const bits) {
+  return bits == 0 ? static_cast<T>(0) // guard: shifting by the full width of T is undefined
+                   : (static_cast<T>(-1) >> ((sizeof(T) * CHAR_BIT) - bits)); // mask with the low `bits` bits set
+}
+
+#if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+// c++14 impl
+static constexpr unsigned int getSetBitsCount(std::uint64_t n) {
+  unsigned int count{0};
+  while (n) {
+    n &= (n - 1);
+    count++;
+  }
+  return count;
+}
+
+static constexpr unsigned int getShiftBitsCount(std::uint64_t n) {
+  // requires c++14
+  unsigned int count{0};
+  if (n == 0)
+    return count;
+  while ((n & 0x1) == 0) {
+    n >>= 1;
+    ++count;
+  }
+  return count;
+}
+
+#elif __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
+// c++11 impl
+static constexpr unsigned int getSetBitsCount(std::uint64_t n) {
+  return n == 0 ? 0 : 1 + getSetBitsCount(n & (n - 1));
+}
+
+static constexpr unsigned int getShiftBitsCount(std::uint64_t n) {
+  return n == 0 ? 0 : ((n & 0x1) == 0 ? 1 + getShiftBitsCount(n >> 1) : 0);
+}
+
+#if (__cplusplus == 201103L) && (defined(__clang__) || defined(__GNUC__))
+namespace std { // C++11 only: stand-in for std::make_unique, which the library gained in C++14
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args &&... args) { // NOTE(review): the standard forbids adding declarations to namespace std
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+} // namespace std
+#endif
+
+#else
+#error This library needs at least a C++11 compliant compiler
+#endif
diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h
index 520820c02..e25260909 100644
--- a/src/TNL/Containers/ByteArraySynchronizer.h
+++ b/src/TNL/Containers/ByteArraySynchronizer.h
@@ -12,7 +12,13 @@
 
 #pragma once
 
+#include <future>
+// 3rd-party async library providing a thread-pool
+#include <async/threadpool.h>
+
 #include <TNL/Containers/ArrayView.h>
+#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Timer.h>
 
 namespace TNL {
 namespace Containers {
@@ -20,12 +26,121 @@ namespace Containers {
 template< typename Device, typename Index >
 class ByteArraySynchronizer
 {
+private:
+   // NOTE: async::threadpool has alignment requirements, which causes problems:
+   //  - it may become misaligned in derived classes, see e.g.
+   //    https://stackoverflow.com/a/46475498
+   //    solution: specify it as the first member of the base class
+   //  - operator new before C++17 may not support over-aligned types, see
+   //    https://stackoverflow.com/a/53485295
+   //    solution: relaxed alignment requirements to not exceed the value of
+   //    alignof(std::max_align_t), which is the strongest alignment supported
+   //    by plain new. See https://github.com/d36u9/async/pull/2
+   async::threadpool tp;
+
+   int gpu_id = 0;
+
 public:
    using ByteArrayView = ArrayView< std::uint8_t, Device, Index >;
+   using RequestsVector = std::vector< typename Communicators::MpiCommunicator::Request >;
+
+   enum class AsyncPolicy {
+      synchronous,
+      deferred,
+      threadpool,
+      async,
+   };
+
+   ByteArraySynchronizer() : tp(1) {}
+
+   virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) = 0;
+
+   virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) = 0;
+
+   /**
+    * \brief An asynchronous version of \ref synchronizeByteArray.
+    *
+    * Note that this method is not thread-safe - only the thread which created
+    * and "owns" the instance of this object can call this method.
+    *
+    * Note that at most one async operation may be active at a time, the
+    * following calls will block until the pending operation is finished.
+    */
+   void synchronizeByteArrayAsync( ByteArrayView array, int bytesPerValue, AsyncPolicy policy = AsyncPolicy::synchronous )
+   {
+      // wait for any previous synchronization (multiple objects can share the
+      // same synchronizer)
+      if( async_op.valid() ) {
+         async_wait_before_start_timer.start();
+         async_op.wait();
+         async_wait_before_start_timer.stop();
+      }
 
-   virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) = 0;
+      async_start_timer.start();
+
+      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
+      #ifdef HAVE_CUDA
+      if( std::is_same< Device, Devices::Cuda >::value )
+         cudaGetDevice(&gpu_id); // remember the caller's device for the worker thread
+      #endif
+
+      if( policy == AsyncPolicy::threadpool || policy == AsyncPolicy::async ) {
+         // everything offloaded to a separate thread; the captures are spelled
+         // out because the lambda leaves this scope: `this` plus copies of array and bytesPerValue
+         auto worker = [this, array, bytesPerValue] () {
+            // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
+            #ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaSetDevice(this->gpu_id);
+            #endif
+
+            this->synchronizeByteArray( array, bytesPerValue );
+         };
+
+         if( policy == AsyncPolicy::threadpool )
+            async_op = tp.post( worker ); // runs on the synchronizer's own thread pool
+         else
+            async_op = std::async( std::launch::async, worker ); // runs on a thread started by std::async
+      }
+      else if( policy == AsyncPolicy::deferred ) {
+         // immediate start, deferred synchronization (but still in the same thread)
+         auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
+         auto worker = [requests] () mutable {
+            Communicators::MpiCommunicator::WaitAll( requests.data(), requests.size() );
+         };
+         this->async_op = std::async( std::launch::deferred, worker ); // WaitAll runs when async_op is waited on
+      }
+      else {
+         // synchronous
+         synchronizeByteArray( array, bytesPerValue );
+      }
+
+      async_ops_count++;
+      async_start_timer.stop();
+   }
 
    virtual ~ByteArraySynchronizer() = default;
+
+   /**
+    * \brief Can be used for checking if a synchronization started
+    * asynchronously has been finished.
+    *
+    * Note that derived classes *must* make this check in the destructor,
+    * otherwise running \ref synchronizeByteArrayAsync would lead to the error
+    * `pure virtual method called` when the derived object is destructed before
+    * the async operation finishes. This cannot be implemented in the base class
+    * destructor, because the derived destructor is run first.
+    *
+    *    ~Derived()
+    *    {
+    *       if( this->async_op.valid() )
+    *          this->async_op.wait();
+    *    }
+    */
+   std::future< void > async_op;
+
+   // attributes for profiling
+   Timer async_wait_before_start_timer, async_start_timer, async_wait_timer;
+   std::size_t async_ops_count = 0;
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index c1571bc9e..33e96ca9a 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -49,6 +49,8 @@ public:
    using Self = DistributedArray< _Value, _Device, _Index, _Communicator >;
 
 
+   ~DistributedArray();
+
    DistributedArray() = default;
 
    // Copy-constructor does deep copy.
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index c23d0a7e4..cd0eb49d5 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -20,6 +20,18 @@
 namespace TNL {
 namespace Containers {
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+DistributedArray< Value, Device, Index, Communicator >::
+~DistributedArray()
+{
+   // Wait for any pending asynchronous synchronization to finish; otherwise
+   // the synchronizer would crash when the array goes out of scope.
+   waitForSynchronization();
+}
+
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index bf63f8cc6..0a9aef1a4 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -51,6 +51,8 @@ public:
    using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >;
 
 
+   ~DistributedArrayView();
+
    // Initialization by raw data
    DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
    : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData)
@@ -107,6 +109,8 @@ public:
 
    int getValuesPerElement() const;
 
+   // Note that this method is not thread-safe - only the thread which created
+   // and "owns" the instance of this object can call this method.
    void startSynchronization();
 
    void waitForSynchronization() const;
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index cb95427fc..65654a54d 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -17,6 +17,20 @@
 namespace TNL {
 namespace Containers {
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Communicator >
+DistributedArrayView< Value, Device, Index, Communicator >::
+~DistributedArrayView()
+{
+   // Wait for any pending asynchronous synchronization to finish; otherwise
+   // the synchronizer might crash when the view goes out of scope.
+   // (DistributedArray's destructor does the same, but a view may be bound to
+   // an array that has no synchronizer of its own, in which case this helps.)
+   waitForSynchronization();  // does nothing when no synchronizer is set or no async operation is pending
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -234,14 +248,9 @@ startSynchronization()
    // like linear solvers...)
    TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" );
 
-   // wait for any previous synchronization (in case the array was inconsistently modified
-   // while a synchronization was in progress)
-   waitForSynchronization();
-
    typename SynchronizerType::ByteArrayView bytes;
    bytes.bind( reinterpret_cast<std::uint8_t*>( localData.getData() ), sizeof(ValueType) * localData.getSize() );
-   // TODO: implement the async stuff
-   synchronizer->synchronizeByteArray( bytes, sizeof(ValueType) * valuesPerElement );
+   synchronizer->synchronizeByteArrayAsync( bytes, sizeof(ValueType) * valuesPerElement );
 }
 
 template< typename Value,
@@ -252,7 +261,11 @@ void
 DistributedArrayView< Value, Device, Index, Communicator >::
 waitForSynchronization() const
 {
-   // TODO: implement the async stuff
+   if( synchronizer && synchronizer->async_op.valid() ) {
+      synchronizer->async_wait_timer.start();
+      synchronizer->async_op.wait();
+      synchronizer->async_wait_timer.stop();
+   }
 }
 
 
diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h
index 75e958734..32ba735e5 100644
--- a/src/TNL/Containers/Partitioner.h
+++ b/src/TNL/Containers/Partitioner.h
@@ -82,6 +82,14 @@ public:
 
    public:
       using ByteArrayView = typename Base::ByteArrayView;
+      using RequestsVector = typename Base::RequestsVector;
+
+      ~ArraySynchronizer()
+      {
+         // wait for a pending async operation (if any); it must not outlive this derived object, otherwise it would crash
+         if( this->async_op.valid() )
+            this->async_op.wait();
+      }
 
       ArraySynchronizer() = delete;
 
@@ -89,7 +97,13 @@ public:
       : localRange(localRange), overlaps(overlaps), group(group)
       {}
 
-      virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override
+      virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
+      {
+         auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );  // start the communication and collect its requests
+         Communicator::WaitAll( requests.data(), requests.size() );  // synchronous variant: wait for all requests right away
+      }
+
+      virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
       {
          TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps),
                         "unexpected array size" );
@@ -122,8 +136,7 @@ public:
                   bytesPerValue * overlaps,
                   right, 0, group ) );
 
-         // wait for all communications to finish
-         Communicator::WaitAll( requests.data(), requests.size() );
+         return requests;
       }
    };
 };
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 225d1a2df..382de6905 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -42,6 +42,14 @@ public:
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
    using CommunicatorType = typename DistributedMesh::CommunicatorType;
    using ByteArrayView = typename Base::ByteArrayView;
+   using RequestsVector = typename Base::RequestsVector;
+
+   ~DistributedMeshSynchronizer()
+   {
+      // wait for a pending async operation (if any); it must not outlive this derived object, otherwise it would crash
+      if( this->async_op.valid() )
+         this->async_op.wait();
+   }
 
    DistributedMeshSynchronizer() = default;
 
@@ -52,12 +60,6 @@ public:
       TNL_ASSERT_EQ( mesh.template getGlobalIndices< EntityDimension >().getSize(), mesh.getLocalMesh().template getEntitiesCount< EntityDimension >(),
                      "Global indices are not allocated properly." );
 
-      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
-      #ifdef HAVE_CUDA
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
-         cudaGetDevice(&this->gpu_id);
-      #endif
-
       group = mesh.getCommunicationGroup();
       const int rank = CommunicatorType::GetRank( group );
       const int nproc = CommunicatorType::GetSize( group );
@@ -127,7 +129,7 @@ public:
 
       // send indices of ghost entities - set them as ghost neighbors on the target rank
       {
-         std::vector< typename CommunicatorType::Request > requests;
+         RequestsVector requests;
 
          // send our ghost indices to the neighboring ranks
          GlobalIndexType ghostOffset = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >();
@@ -196,17 +198,17 @@ public:
       synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement );
    }
 
-   virtual void synchronizeByteArray( ByteArrayView& array, int bytesPerValue ) override
+   virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
+   {
+      auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );  // start the communication and collect its requests
+      CommunicatorType::WaitAll( requests.data(), requests.size() );  // synchronous variant: wait for all requests right away
+   }
+
+   virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
    {
       TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ],
                      "The array does not have the expected size." );
 
-      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
-      #ifdef HAVE_CUDA
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
-         cudaSetDevice(gpu_id);
-      #endif
-
       const int rank = CommunicatorType::GetRank( group );
       const int nproc = CommunicatorType::GetSize( group );
 
@@ -214,7 +216,7 @@ public:
       sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > requests;
+      RequestsVector requests;
 
       // issue all receive async operations
       for( int j = 0; j < nproc; j++ ) {
@@ -250,8 +252,7 @@ public:
          }
       }
 
-      // wait for all communications to finish
-      CommunicatorType::WaitAll( requests.data(), requests.size() );
+      return requests;
    }
 
    // performs a synchronization of a sparse matrix
@@ -271,7 +272,7 @@ public:
       const int nproc = CommunicatorType::GetSize( group );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > requests;
+      RequestsVector requests;
 
       Containers::Array< GlobalIndexType, Devices::Host, int > send_rankOffsets( nproc + 1 ), recv_rankOffsets( nproc + 1 );
       Containers::Array< GlobalIndexType, Devices::Host, GlobalIndexType > send_rowCapacities, send_rowPointers, send_columnIndices, recv_rowPointers, recv_columnIndices;
@@ -350,7 +351,7 @@ public:
          // allocate row pointers
          recv_rowPointers.setSize( recv_rankOffsets[ nproc ] + 1 );
 
-         std::vector< typename CommunicatorType::Request > row_lengths_requests;
+         RequestsVector row_lengths_requests;
 
          // set row pointers
          GlobalIndexType rowPtr = 0;
@@ -443,9 +444,6 @@ public:
    }
 
 protected:
-   // GOTCHA (see above)
-   int gpu_id = 0;
-
    // communication group taken from the distributed mesh
    typename CommunicatorType::CommunicationGroup group;
 
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index d201a0a09..f594a081b 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -104,6 +104,7 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal )
    ArrayType globalArray( this->globalSize );
    setLinearSequence( globalArray );
    this->distributedArray.copyFromGlobal( globalArray );
+   this->distributedArray.waitForSynchronization();
 
    const auto localRange = this->distributedArray.getLocalRange();
    ArrayViewType localArrayView;
@@ -151,6 +152,7 @@ TYPED_TEST( DistributedArrayTest, setValue )
    using ArrayType = typename TestFixture::ArrayType;
 
    this->distributedArray.setValue( 1.0 );
+   this->distributedArray.waitForSynchronization();
    ArrayViewType localArrayView = this->distributedArray.getLocalView();
    ArrayType expected( localArrayView.getSize() );
    expected.setValue( 1.0 );
@@ -163,6 +165,7 @@ TYPED_TEST( DistributedArrayTest, setValueGhosts )
    using ArrayType = typename TestFixture::ArrayType;
 
    this->distributedArray.setValue( this->rank );
+   this->distributedArray.waitForSynchronization();
    ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts();
    ArrayType expected( localArrayView.getSize() );
    expected.setValue( this->rank );
@@ -184,6 +187,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    using IndexType = typename TestFixture::IndexType;
 
    this->distributedArray.setValue( 0 );
+   this->distributedArray.waitForSynchronization();
    ArrayViewType localArrayView = this->distributedArray.getLocalView();
    const auto localRange = this->distributedArray.getLocalRange();
 
@@ -214,6 +218,7 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    }
 
    this->distributedArray.setValue( 0 );
+   this->distributedArray.waitForSynchronization();
 
    // use operator[]
    if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
@@ -322,6 +327,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue )
       EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) );
 
    this->distributedArray.setValue( 100 );
+   this->distributedArray.waitForSynchronization();
    EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) );
 }
 
-- 
GitLab


From 9a88469e711e804d98aff30c1538118989ab5df4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 29 Dec 2020 21:11:37 +0100
Subject: [PATCH 35/50] MPI refactoring: split MpiCommunicator into plain
 functions in the TNL::MPI namespace

---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |   4 +-
 .../tnl-benchmark-linear-solvers.h            |   4 +-
 .../ODESolvers/tnl-benchmark-ode-solvers.h    |  10 +-
 src/TNL/Communicators/MPITypeResolver.h       | 108 ------
 src/TNL/Communicators/MpiCommunicator.h       | 273 ++------------
 src/TNL/Containers/DistributedArray.hpp       |   1 -
 .../Expressions/DistributedComparison.h       |   2 +-
 .../DistributedVerticalOperations.h           |   2 +-
 src/TNL/MPI.h                                 |  29 ++
 .../MpiDefs.h => MPI/DummyDefs.h}             |  17 +-
 .../{Communicators/MPIPrint.h => MPI/Print.h} |  55 ++-
 .../ScopedInitializer.h                       |  15 +-
 src/TNL/MPI/Utils.h                           |  46 +++
 src/TNL/MPI/Wrappers.h                        | 347 ++++++++++++++++++
 src/TNL/MPI/getDataType.h                     | 119 ++++++
 src/TNL/MPI/selectGPU.h                       |  72 ++++
 .../DistributedMeshes/BufferEntitiesHelper.h  |   1 -
 .../DistributedGridIO_MeshFunction.h          |  71 ++--
 .../DistributedGridSynchronizer.h             |   1 -
 src/TNL/Solvers/Solver_impl.h                 |   6 +-
 src/Tools/tnl-game-of-life.cpp                |   4 +-
 src/Tools/tnl-init.cpp                        |   4 +-
 src/Tools/tnl-test-distributed-mesh.h         |   4 +-
 .../DistributedNDArrayOverlaps_1D_test.h      |   1 -
 .../DistributedNDArrayOverlaps_semi1D_test.h  |   1 -
 .../ndarray/DistributedNDArray_1D_test.h      |   1 -
 .../ndarray/DistributedNDArray_semi1D_test.h  |   1 -
 .../DistributedMeshes/DistributedMeshTest.h   |   1 -
 src/UnitTests/main_mpi.h                      |   4 +-
 29 files changed, 740 insertions(+), 464 deletions(-)
 delete mode 100644 src/TNL/Communicators/MPITypeResolver.h
 create mode 100644 src/TNL/MPI.h
 rename src/TNL/{Communicators/MpiDefs.h => MPI/DummyDefs.h} (64%)
 rename src/TNL/{Communicators/MPIPrint.h => MPI/Print.h} (75%)
 rename src/TNL/{Communicators => MPI}/ScopedInitializer.h (72%)
 create mode 100644 src/TNL/MPI/Utils.h
 create mode 100644 src/TNL/MPI/Wrappers.h
 create mode 100644 src/TNL/MPI/getDataType.h
 create mode 100644 src/TNL/MPI/selectGPU.h

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 74a3205d3..abe08210d 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -20,7 +20,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -309,7 +309,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 06ba2bc94..75b1e0e25 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -25,7 +25,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -592,7 +592,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index aa4370c7a..fcaaaedf2 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -24,7 +24,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
 
@@ -63,7 +63,7 @@ benchmarkODESolvers( Benchmark& benchmark,
 #ifdef HAVE_CUDA
       CudaVectorPointer cuda_u( dofs );
       *cuda_u = 0.0;
-#endif      
+#endif
       if( solver == "euler" || solver == "all" ) {
          using HostSolver = Solvers::ODE::Euler< HostProblem, SolverMonitorType >;
          benchmark.setOperation("Euler");
@@ -168,10 +168,10 @@ bool resolveRealTypes( Benchmark& benchmark,
    Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
-   if( ( realType == "float" || realType == "all" ) && 
+   if( ( realType == "float" || realType == "all" ) &&
        ! resolveIndexType< float >( benchmark, metadata, parameters ) )
       return false;
-   if( ( realType == "double" || realType == "all" ) && 
+   if( ( realType == "double" || realType == "all" ) &&
        ! resolveIndexType< double >( benchmark, metadata, parameters ) )
       return false;
    return true;
@@ -225,7 +225,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/TNL/Communicators/MPITypeResolver.h b/src/TNL/Communicators/MPITypeResolver.h
deleted file mode 100644
index 5429d5e33..000000000
--- a/src/TNL/Communicators/MPITypeResolver.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
-                          MPITypeResolver.h  -  description
-                             -------------------
-    begin                : Feb 4, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-namespace TNL {
-namespace Communicators {
-
-#ifdef HAVE_MPI
-template<typename Type>
-struct MPITypeResolver
-{
-   static inline MPI_Datatype getType()
-   {
-      static_assert( sizeof(Type) == sizeof(char) ||
-                     sizeof(Type) == sizeof(int) ||
-                     sizeof(Type) == sizeof(short int) ||
-                     sizeof(Type) == sizeof(long int),
-                     "Fatal Error - Unknown MPI Type");
-      switch( sizeof( Type ) )
-      {
-         case sizeof( char ):
-            return MPI_CHAR;
-         case sizeof( int ):
-            return MPI_INT;
-         case sizeof( short int ):
-            return MPI_SHORT;
-         case sizeof( long int ):
-            return MPI_LONG;
-      }
-      // this will never happen thanks to the static_assert above, but icpc is not that smart
-      // and complains about missing return statement at the end of non-void function
-      throw 0;
-   }
-};
-
-template<> struct MPITypeResolver< char >
-{
-    static inline MPI_Datatype getType(){return MPI_CHAR;};
-};
-
-template<> struct MPITypeResolver< int >
-{
-    static inline MPI_Datatype getType(){return MPI_INT;};
-};
-
-template<> struct MPITypeResolver< short int >
-{
-    static inline MPI_Datatype getType(){return MPI_SHORT;};
-};
-
-template<> struct MPITypeResolver< long int >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG;};
-};
-
-template<> struct MPITypeResolver< unsigned char >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;};
-};
-
-template<> struct MPITypeResolver< unsigned short int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;};
-};
-
-template<> struct MPITypeResolver< unsigned int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED;};
-};
-
-template<> struct MPITypeResolver< unsigned long int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;};
-};
-
-template<> struct MPITypeResolver< float >
-{
-    static inline MPI_Datatype getType(){return MPI_FLOAT;};
-};
-
-template<> struct MPITypeResolver< double >
-{
-    static inline MPI_Datatype getType(){return MPI_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< long double >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< bool >
-{
-   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
-   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
-   static inline MPI_Datatype getType() { return MPI_C_BOOL; };
-};
-#endif
-
-} // namespace Communicators
-} // namespace TNL
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index dedc35f03..1995978c5 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -11,36 +11,23 @@
 #pragma once
 
 #include <iostream>
-#include <fstream>
-#include <cstring>
 
 #ifdef HAVE_MPI
-#include <mpi.h>
 #ifdef OMPI_MAJOR_VERSION
    // header specific to OpenMPI (needed for CUDA-aware detection)
    #include <mpi-ext.h>
 #endif
 
 #include <unistd.h>  // getpid
-
-#ifdef HAVE_CUDA
-    #include <TNL/Cuda/CheckDevice.h>
-
-    typedef struct __attribute__((__packed__))  {
-       char name[MPI_MAX_PROCESSOR_NAME];
-    } procName;
-#endif
-
 #endif
 
 #include <TNL/String.h>
 #include <TNL/Logger.h>
-#include <TNL/Debugging/OutputRedirection.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/Wrappers.h>
+#include <TNL/MPI/DummyDefs.h>
+#include <TNL/MPI/Utils.h>
 #include <TNL/Config/ConfigDescription.h>
-#include <TNL/Exceptions/MPISupportMissing.h>
 #include <TNL/Exceptions/MPIDimsCreateError.h>
-#include <TNL/Communicators/MPITypeResolver.h>
 
 
 namespace TNL {
@@ -88,7 +75,7 @@ class MpiCommunicator
             const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
             const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
             if( redirect )
-               setupRedirection( outputDirectory );
+               MPI::setupRedirection( outputDirectory );
 #ifdef HAVE_CUDA
             int size;
             MPI_Comm_size( MPI_COMM_WORLD, &size );
@@ -144,125 +131,32 @@ class MpiCommunicator
 
       static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
       {
-#ifdef HAVE_MPI
-         switch( required_thread_level ) {
-            case MPI_THREAD_SINGLE:
-            case MPI_THREAD_FUNNELED:
-            case MPI_THREAD_SERIALIZED:
-            case MPI_THREAD_MULTIPLE:
-               break;
-            default:
-               printf("ERROR: invalid argument for the 'required' thread level support: %d\n", required_thread_level);
-               MPI_Abort(MPI_COMM_WORLD, 1);
-         }
-
-         int provided;
-         MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
-         if( provided < required_thread_level ) {
-            const char* level = "";
-            switch( required_thread_level ) {
-               case MPI_THREAD_SINGLE:
-                  level = "MPI_THREAD_SINGLE";
-                  break;
-               case MPI_THREAD_FUNNELED:
-                  level = "MPI_THREAD_FUNNELED";
-                  break;
-               case MPI_THREAD_SERIALIZED:
-                  level = "MPI_THREAD_SERIALIZED";
-                  break;
-               case MPI_THREAD_MULTIPLE:
-                  level = "MPI_THREAD_MULTIPLE";
-                  break;
-            }
-            printf("ERROR: The MPI library does not have the required level of thread support: %s\n", level);
-            MPI_Abort(MPI_COMM_WORLD, 1);
-         }
-
-         selectGPU();
-#endif
+         MPI::Init( argc, argv, required_thread_level );
 
          // silence warnings about (potentially) unused variables
          (void) NullGroup;
-         (void) NullRequest;
-      }
-
-      static void setupRedirection( std::string outputDirectory )
-      {
-#ifdef HAVE_MPI
-         if(isDistributed() )
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank(AllGroup)) + ".txt";
-               const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank(AllGroup)) + ".txt";
-               std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
-               Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
-            }
-         }
-#endif
       }
 
       static void Finalize()
       {
-#ifdef HAVE_MPI
-         if(isDistributed())
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               // restore redirection (not necessary, it uses RAII internally...)
-               Debugging::redirect_stdout_stderr( "", "", true );
-            }
-         }
-         MPI_Finalize();
-#endif
+         MPI::Finalize();
       }
 
       static bool IsInitialized()
       {
-#ifdef HAVE_MPI
-         int initialized, finalized;
-         MPI_Initialized(&initialized);
-         MPI_Finalized(&finalized);
-         return initialized && !finalized;
-#else
-         return true;
-#endif
+         return MPI::isInitialized();
       }
 
       static int GetRank(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetRank cannot be called with NullGroup");
-         int rank;
-         MPI_Comm_rank(group,&rank);
-         return rank;
-#else
-         return 0;
-#endif
+         return MPI::GetRank( group );
       }
 
       static int GetSize(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetSize cannot be called with NullGroup");
-         int size;
-         MPI_Comm_size(group,&size);
-         return size;
-#else
-         return 1;
-#endif
+         return MPI::GetSize( group );
       }
 
-#ifdef HAVE_MPI
-      template< typename T >
-      static MPI_Datatype getDataType( const T& t )
-      {
-         return MPITypeResolver< T >::getType();
-      }
-#endif
-
       //dim-number of dimensions, distr array of guess distr - 0 for computation
       //distr array will be filled by computed distribution
       //more information in MPI documentation
@@ -291,78 +185,42 @@ class MpiCommunicator
 
       static void Barrier( CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Barrier cannot be called with NullGroup");
-         MPI_Barrier(group);
-#endif
+         MPI::Barrier( group );
       }
 
       template <typename T>
       static void Send( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Send cannot be called with NullGroup");
-         MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group );
-#endif
+         MPI::Send( data, count, dest, tag, group );
       }
 
       template <typename T>
       static void Recv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status );
-#endif
-     }
+         MPI::Recv( data, count, src, tag, group );
+      }
 
       template <typename T>
       static Request ISend( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "ISend cannot be called with NullGroup");
-         Request req;
-         MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req);
-         return req;
-#else
-         return 1;
-#endif
+         return MPI::Isend( data, count, dest, tag, group );
       }
 
       template <typename T>
       static Request IRecv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "IRecv cannot be called with NullGroup");
-         Request req;
-         MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req);
-         return req;
-#else
-         return 1;
-#endif
+         return MPI::Irecv( data, count, src, tag, group );
       }
 
       static void WaitAll(Request *reqs, int length)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE);
-#endif
+         MPI::Waitall( reqs, length );
       }
 
       template< typename T >
       static void Bcast( T* data, int count, int root, CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
-         MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group);
-#endif
+         MPI::Bcast( data, count, root, group );
       }
 
       template< typename T >
@@ -372,12 +230,7 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group);
-#else
-         memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
-#endif
+         MPI::Allreduce( data, reduced_data, count, op, group );
       }
 
       // in-place variant of Allreduce
@@ -387,27 +240,18 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group);
-#endif
+         MPI::Allreduce( data, count, op, group );
       }
 
-
       template< typename T >
       static void Reduce( const T* data,
                           T* reduced_data,
                           int count,
-                          MPI_Op &op,
+                          const MPI_Op &op,
                           int root,
                           CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup");
-         MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group);
-#else
-         memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) );
-#endif
+         MPI::Reduce( data, reduced_data, count, op, root, group );
       }
 
       template< typename T >
@@ -421,24 +265,7 @@ class MpiCommunicator
                                int receiveTag,
                                CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Sendrecv( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       destination,
-                       sendTag,
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       source,
-                       receiveTag,
-                       group,
-                       &status );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Sendrecv( sendData, sendCount, destination, sendTag, receiveData, receiveCount, source, receiveTag, group );
       }
 
       template< typename T >
@@ -448,19 +275,7 @@ class MpiCommunicator
                             int receiveCount,
                             CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Alltoall( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       group );
-#else
-         TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
-         memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) );
-#endif
+         MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
 
@@ -485,58 +300,16 @@ class MpiCommunicator
       }
 
 #ifdef HAVE_MPI
-      static MPI_Request NullRequest;
       static MPI_Comm AllGroup;
       static MPI_Comm NullGroup;
 #else
-      static constexpr int NullRequest = -1;
       static constexpr int AllGroup = 1;
       static constexpr int NullGroup = 0;
 #endif
    private:
-
-      static void selectGPU(void)
-      {
-#ifdef HAVE_MPI
-    #ifdef HAVE_CUDA
-         const int count = GetSize(AllGroup);
-         const int rank = GetRank(AllGroup);
-         int gpuCount;
-         cudaGetDeviceCount(&gpuCount);
-
-         procName names[count];
-
-         int i=0;
-         int len;
-         MPI_Get_processor_name(names[rank].name, &len);
-
-         for(i=0;i<count;i++)
-            std::memcpy(names[i].name,names[rank].name,len+1);
-
-         MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-            (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-                     MPI_COMM_WORLD);
-
-         int nodeRank=0;
-         for(i=0;i<rank;i++)
-         {
-            if(std::strcmp(names[rank].name,names[i].name)==0)
-               nodeRank++;
-         }
-
-         const int gpuNumber = nodeRank % gpuCount;
-
-         cudaSetDevice(gpuNumber);
-         TNL_CHECK_CUDA_DEVICE;
-
-         //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
-    #endif
-#endif
-      }
 };
 
 #ifdef HAVE_MPI
-MPI_Request MpiCommunicator::NullRequest = MPI_REQUEST_NULL;
 MPI_Comm MpiCommunicator::AllGroup = MPI_COMM_WORLD;
 MPI_Comm MpiCommunicator::NullGroup = MPI_COMM_NULL;
 #endif
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index cd0eb49d5..61dc3eda0 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -15,7 +15,6 @@
 #include "DistributedArray.h"
 
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Communicators/MpiDefs.h>  // important only when MPI is disabled
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h
index 1cef0873d..2695ccccc 100644
--- a/src/TNL/Containers/Expressions/DistributedComparison.h
+++ b/src/TNL/Containers/Expressions/DistributedComparison.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/DummyDefs.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index b525e8a53..f55ae3d4a 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/VerticalOperations.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/DummyDefs.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h
new file mode 100644
index 000000000..b1b7dd698
--- /dev/null
+++ b/src/TNL/MPI.h
@@ -0,0 +1,29 @@
+/***************************************************************************
+                          MPI.h  -  description
+                             -------------------
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+/**
+ * \brief A convenient header file which includes all headers from the
+ * `TNL/MPI/` subdirectory.
+ *
+ * Users may use this to avoid having to include many header files in their
+ * projects. On the other hand, parts of the TNL library should generally
+ * include only the specific headers they need, in order to avoid cycles in
+ * the header inclusion.
+ */
+
+#include "MPI/DummyDefs.h"
+#include "MPI/getDataType.h"
+#include "MPI/selectGPU.h"
+#include "MPI/Wrappers.h"
+#include "MPI/Utils.h"
+#include "MPI/ScopedInitializer.h"
+#include "MPI/Print.h"
diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/MPI/DummyDefs.h
similarity index 64%
rename from src/TNL/Communicators/MpiDefs.h
rename to src/TNL/MPI/DummyDefs.h
index df43005ec..cdd5ea483 100644
--- a/src/TNL/Communicators/MpiDefs.h
+++ b/src/TNL/MPI/DummyDefs.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          MpiCommunicator.h  -  description
+                          MPI/DummyDefs.h  -  description
                              -------------------
-    begin                : 2005/04/23
-    copyright            : (C) 2005 by Tomas Oberhuber
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -11,6 +11,9 @@
 #pragma once
 
 #ifndef HAVE_MPI
+using MPI_Request = int;
+using MPI_Comm = int;
+
 enum MPI_Op {
    MPI_MAX,
    MPI_MIN,
@@ -28,9 +31,9 @@ enum MPI_Op {
 
 // MPI_Init_thread constants
 enum {
-  MPI_THREAD_SINGLE,
-  MPI_THREAD_FUNNELED,
-  MPI_THREAD_SERIALIZED,
-  MPI_THREAD_MULTIPLE
+   MPI_THREAD_SINGLE,
+   MPI_THREAD_FUNNELED,
+   MPI_THREAD_SERIALIZED,
+   MPI_THREAD_MULTIPLE
 };
 #endif
diff --git a/src/TNL/Communicators/MPIPrint.h b/src/TNL/MPI/Print.h
similarity index 75%
rename from src/TNL/Communicators/MPIPrint.h
rename to src/TNL/MPI/Print.h
index 6d78eafaf..5cd4819a2 100644
--- a/src/TNL/Communicators/MPIPrint.h
+++ b/src/TNL/MPI/Print.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          MPIPrint.h  -  description
+                          MPI/Print.h  -  description
                              -------------------
     begin                : Feb 7, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -10,34 +10,35 @@
 
 #pragma once
 
+#include <iostream>
 #include <sstream>
-#include <TNL/Communicators/MpiCommunicator.h>
+
+#include <TNL/String.h>
+#include <TNL/MPI/Wrappers.h>
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT( message )                                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       std::stringstream __tnl_mpi_print_stream_;                                                                                 \
-      __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                             \
-         << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                     \
+      __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                        \
+                              << message << std::endl;                                                                           \
       TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                      \
       mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                                  \
    }                                                                                                                             \
    else                                                                                                                          \
    {                                                                                                                             \
-      std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;              \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
-         {                                                                                                                       \
-            TNL::String __tnl_mpi_print_string_;                                                                                 \
-            mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                         \
-            std::cerr << __tnl_mpi_print_string_;                                                                                \
-         }                                                                                                                       \
+      std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                         \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
+      {                                                                                                                          \
+         TNL::String __tnl_mpi_print_string_;                                                                                    \
+         mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                            \
+         std::cerr << __tnl_mpi_print_string_;                                                                                   \
+      }                                                                                                                          \
    }                                                                                                                             \
 }
 #else
@@ -47,11 +48,11 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_MASTER( message )                                                                                          \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() == 0 )                                                                     \
+   if( TNL::MPI::GetRank() == 0 )                                                                     \
    {                                                                                                                             \
       std::cerr << "Master node : " << message << std::endl;                                                                     \
    }                                                                                                                             \
@@ -63,20 +64,20 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_COND( condition, message )                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
 {                                                                                                                                \
    if( condition) std::cerr << message << std::endl;                                                                             \
 }                                                                                                                                \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       int __tnl_mpi_print_cnd = ( condition );                                                                                   \
-      TNL::Communicators::MpiCommunicator::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                \
+      TNL::MPI::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                                           \
       if( condition ) {                                                                                                          \
          std::stringstream __tnl_mpi_print_stream_;                                                                              \
-         __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                          \
-            << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                  \
+         __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                     \
+                                 << message << std::endl;                                                                        \
          TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                   \
          mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                               \
       }                                                                                                                          \
@@ -84,13 +85,11 @@ else
    else                                                                                                                          \
    {                                                                                                                             \
       if( condition )                                                                                                            \
-         std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;           \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
+         std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                      \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
          {                                                                                                                       \
             int __tnl_mpi_print_cond;                                                                                            \
-            TNL::Communicators::MpiCommunicator::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                         \
+            TNL::MPI::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                                                    \
             if( __tnl_mpi_print_cond )                                                                                           \
             {                                                                                                                    \
                TNL::String __tnl_mpi_print_string_;                                                                              \
diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h
similarity index 72%
rename from src/TNL/Communicators/ScopedInitializer.h
rename to src/TNL/MPI/ScopedInitializer.h
index 2970bc628..82ba02bc5 100644
--- a/src/TNL/Communicators/ScopedInitializer.h
+++ b/src/TNL/MPI/ScopedInitializer.h
@@ -12,22 +12,25 @@
 
 #pragma once
 
+#include "Wrappers.h"
+#include "Utils.h"
+
 namespace TNL {
-namespace Communicators {
+namespace MPI {
 
-template< typename Communicator >
 struct ScopedInitializer
 {
-   ScopedInitializer( int& argc, char**& argv )
+   ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
    {
-      Communicator::Init( argc, argv );
+      Init( argc, argv, required_thread_level );  // forward the requested thread level instead of silently using the default
    }
 
    ~ScopedInitializer()
    {
-      Communicator::Finalize();
+      restoreRedirection();
+      Finalize();
    }
 };
 
-} // namespace Communicators
+} // namespace MPI
 } // namespace TNL
diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h
new file mode 100644
index 000000000..b655aefd0
--- /dev/null
+++ b/src/TNL/MPI/Utils.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          MPI/Utils.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Debugging/OutputRedirection.h>
+
+#include "Wrappers.h"
+
+namespace TNL {
+namespace MPI {
+
+inline bool isInitialized()
+{
+   return Initialized() && ! Finalized();
+}
+
+inline void setupRedirection( std::string outputDirectory )
+{
+#ifdef HAVE_MPI
+   if( GetSize() > 1 && GetRank() != 0 ) {
+      const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank()) + ".txt";
+      const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank()) + ".txt";
+      std::cout << GetRank() << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
+      Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
+   }
+#endif
+}
+
+// restore redirection (usually not necessary, it uses RAII internally...)
+inline void restoreRedirection()
+{
+   if( GetSize() > 1 && GetRank() != 0 ) {
+      Debugging::redirect_stdout_stderr( "", "", true );
+   }
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
new file mode 100644
index 000000000..9a057da5f
--- /dev/null
+++ b/src/TNL/MPI/Wrappers.h
@@ -0,0 +1,347 @@
+/***************************************************************************
+                          MPI/Wrappers.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#else
+   #include "DummyDefs.h"
+   #include <cstring>  // std::memcpy
+   #include <TNL/Exceptions/MPISupportMissing.h>
+#endif
+
+#include <TNL/Assert.h>
+#include "getDataType.h"
+#include "selectGPU.h"
+
+namespace TNL {
+namespace MPI {
+
+// function wrappers for MPI constants
+
+inline MPI_Comm AllGroup()
+{
+#ifdef HAVE_MPI
+   return MPI_COMM_WORLD;
+#else
+   return 1;
+#endif
+}
+
+inline MPI_Comm NullGroup()
+{
+#ifdef HAVE_MPI
+   return MPI_COMM_NULL;
+#else
+   return 0;
+#endif
+}
+
+inline MPI_Request NullRequest()
+{
+#ifdef HAVE_MPI
+   return MPI_REQUEST_NULL;
+#else
+   return 0;
+#endif
+}
+
+// wrappers for basic MPI functions
+
+inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
+{
+#ifdef HAVE_MPI
+   switch( required_thread_level ) {
+      case MPI_THREAD_SINGLE:
+      case MPI_THREAD_FUNNELED:
+      case MPI_THREAD_SERIALIZED:
+      case MPI_THREAD_MULTIPLE:
+         break;
+      default:
+         std::cerr << "ERROR: invalid argument for the 'required' thread level support: " << required_thread_level << std::endl;
+         MPI_Abort(MPI_COMM_WORLD, 1);
+   }
+
+   int provided;
+   MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
+   if( provided < required_thread_level ) {
+      const char* level = "";
+      switch( required_thread_level ) {
+         case MPI_THREAD_SINGLE:
+            level = "MPI_THREAD_SINGLE";
+            break;
+         case MPI_THREAD_FUNNELED:
+            level = "MPI_THREAD_FUNNELED";
+            break;
+         case MPI_THREAD_SERIALIZED:
+            level = "MPI_THREAD_SERIALIZED";
+            break;
+         case MPI_THREAD_MULTIPLE:
+            level = "MPI_THREAD_MULTIPLE";
+            break;
+      }
+      std::cerr << "ERROR: The MPI library does not have the required level of thread support: " << level << std::endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+   }
+
+   selectGPU();
+#endif
+}
+
+inline void Finalize()
+{
+#ifdef HAVE_MPI
+   MPI_Finalize();
+#endif
+}
+
+inline bool Initialized()
+{
+#ifdef HAVE_MPI
+    int flag;
+    MPI_Initialized(&flag);
+    return flag;
+#else
+    return true;
+#endif
+}
+
+inline bool Finalized()
+{
+#ifdef HAVE_MPI
+    int flag;
+    MPI_Finalized(&flag);
+    return flag;
+#else
+    return false;
+#endif
+}
+
+inline int GetRank( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetRank cannot be called with NullGroup" );
+   int rank;
+   MPI_Comm_rank( group, &rank );
+   return rank;
+#else
+   return 0;
+#endif
+}
+
+inline int GetSize( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetSize cannot be called with NullGroup" );
+   int size;
+   MPI_Comm_size( group, &size );
+   return size;
+#else
+   return 1;
+#endif
+}
+
+// wrappers for MPI communication functions
+
+inline void Barrier( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Barrier cannot be called with NullGroup" );
+   MPI_Barrier(group);
+#endif
+}
+
+inline void Waitall( MPI_Request* reqs, int length )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   MPI_Waitall( length, reqs, MPI_STATUSES_IGNORE );
+#endif
+}
+
+template< typename T >
+void Send( const T* data,
+           int count,
+           int dest,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Send cannot be called with NullGroup" );
+   MPI_Send( (const void*) data, count, getDataType<T>(), dest, tag, group );
+#endif
+}
+
+template< typename T >
+void Recv( T* data,
+           int count,
+           int src,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Recv cannot be called with NullGroup" );
+   MPI_Recv( (void*) data, count, getDataType<T>(), src, tag, group, MPI_STATUS_IGNORE );
+#endif
+}
+
+template< typename T >
+void Sendrecv( const T* sendData,
+               int sendCount,
+               int destination,
+               int sendTag,
+               T* receiveData,
+               int receiveCount,
+               int source,
+               int receiveTag,
+               MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Sendrecv cannot be called with NullGroup" );
+   MPI_Sendrecv( (const void*) sendData,
+                 sendCount,
+                 getDataType<T>(),
+                 destination,
+                 sendTag,
+                 (void*) receiveData,
+                 receiveCount,
+                 getDataType<T>(),
+                 source,
+                 receiveTag,
+                 group,
+                 MPI_STATUS_IGNORE );
+#else
+   throw Exceptions::MPISupportMissing();
+#endif
+}
+
+template< typename T >
+MPI_Request Isend( const T* data,
+                   int count,
+                   int dest,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Isend cannot be called with NullGroup" );
+   MPI_Request req;
+   MPI_Isend( (const void*) data, count, getDataType<T>(), dest, tag, group, &req );
+   return req;
+#else
+   return NullRequest();
+#endif
+}
+
+template< typename T >
+MPI_Request Irecv( T* data,
+                   int count,
+                   int src,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Irecv cannot be called with NullGroup" );
+   MPI_Request req;
+   MPI_Irecv( (void*) data, count, getDataType<T>(), src, tag, group, &req );
+   return req;
+#else
+   return NullRequest();
+#endif
+}
+
+template< typename T >
+void Allreduce( const T* data,
+                T* reduced_data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, group );
+#else
+   std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );
+#endif
+}
+
+// in-place variant of Allreduce
+template< typename T >
+void Allreduce( T* data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType<T>(), op, group );
+#endif
+}
+
+template< typename T >
+void Reduce( const T* data,
+             T* reduced_data,
+             int count,
+             const MPI_Op& op,
+             int root,
+             MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Reduce cannot be called with NullGroup" );
+   MPI_Reduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, root, group );
+#else  // without MPI the input is simply copied to the output
+   std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );
+#endif
+}
+
+template< typename T >
+void Bcast( T* data, int count, int root, MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Bcast cannot be called with NullGroup" );
+   MPI_Bcast( (void*) data, count, getDataType<T>(), root, group );
+#endif
+}
+
+template< typename T >
+void Alltoall( const T* sendData,
+               int sendCount,
+               T* receiveData,
+               int receiveCount,
+               MPI_Comm group )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Alltoall cannot be called with NullGroup" );
+   MPI_Alltoall( (const void*) sendData,
+                 sendCount,
+                 getDataType<T>(),
+                 (void*) receiveData,
+                 receiveCount,
+                 getDataType<T>(),
+                 group );
+#else
+   TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
+   std::memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof(T) );
+#endif
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/getDataType.h b/src/TNL/MPI/getDataType.h
new file mode 100644
index 000000000..f3570679b
--- /dev/null
+++ b/src/TNL/MPI/getDataType.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          getDataType.h  -  description
+                             -------------------
+    begin                : Feb 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#endif
+
+namespace TNL {
+namespace MPI {
+
+#ifdef HAVE_MPI
+template< typename T >
+struct TypeResolver
+{
+   static inline MPI_Datatype getType()
+   {
+      static_assert( sizeof(T) == sizeof(char) ||
+                     sizeof(T) == sizeof(int) ||
+                     sizeof(T) == sizeof(short int) ||
+                     sizeof(T) == sizeof(long int),
+                     "Fatal Error - Unknown MPI Type");
+      // Fallback for types without a specialization: pick an MPI integer type
+      // of the same size. An if-chain is used instead of a switch, because
+      // sizeof(int) == sizeof(long int) would yield duplicate case labels.
+      if( sizeof(T) == sizeof(char) )
+         return MPI_CHAR;
+      if( sizeof(T) == sizeof(int) )
+         return MPI_INT;
+      if( sizeof(T) == sizeof(short int) )
+         return MPI_SHORT;
+      if( sizeof(T) == sizeof(long int) )
+         return MPI_LONG;
+      // This will never happen thanks to the static_assert above, but icpc is
+      // not that smart and complains about missing return statement at the end
+      // of non-void function.
+      throw 0;
+   }
+};
+
+template<> struct TypeResolver< char >
+{
+   static inline MPI_Datatype getType(){return MPI_CHAR;};
+};
+
+template<> struct TypeResolver< int >
+{
+   static inline MPI_Datatype getType(){return MPI_INT;};
+};
+
+template<> struct TypeResolver< short int >
+{
+   static inline MPI_Datatype getType(){return MPI_SHORT;};
+};
+
+template<> struct TypeResolver< long int >
+{
+   static inline MPI_Datatype getType(){return MPI_LONG;};
+};
+
+template<> struct TypeResolver< unsigned char >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;};
+};
+
+template<> struct TypeResolver< unsigned short int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;};
+};
+
+template<> struct TypeResolver< unsigned int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED;};
+};
+
+template<> struct TypeResolver< unsigned long int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;};
+};
+
+template<> struct TypeResolver< float >
+{
+   static inline MPI_Datatype getType(){return MPI_FLOAT;};
+};
+
+template<> struct TypeResolver< double >
+{
+   static inline MPI_Datatype getType(){return MPI_DOUBLE;};
+};
+
+template<> struct TypeResolver< long double >
+{
+   static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;};
+};
+
+template<> struct TypeResolver< bool >
+{
+   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
+   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
+   static inline MPI_Datatype getType() { return MPI_C_BOOL; };
+};
+
+template< typename T >
+MPI_Datatype getDataType( const T& = T{} )
+{
+   return TypeResolver< T >::getType();
+}
+#endif
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h
new file mode 100644
index 000000000..def9a329f
--- /dev/null
+++ b/src/TNL/MPI/selectGPU.h
@@ -0,0 +1,72 @@
+/***************************************************************************
+                          selectGPU.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <cstring>
+
+#include <TNL/Cuda/CheckDevice.h>
+
+namespace TNL {
+namespace MPI {
+namespace {
+
+#ifdef HAVE_MPI
+#ifdef HAVE_CUDA
+   typedef struct __attribute__((__packed__)) {
+      char name[MPI_MAX_PROCESSOR_NAME];
+   } procName;
+#endif
+#endif
+
+inline void selectGPU()
+{
+#ifdef HAVE_MPI
+#ifdef HAVE_CUDA
+   int size;
+   MPI_Comm_size( MPI_COMM_WORLD, &size );
+   int rank;
+   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+   int gpuCount;
+   cudaGetDeviceCount( &gpuCount );
+
+   procName names[size];
+
+   int i=0;
+   int len;
+   MPI_Get_processor_name(names[rank].name, &len);
+
+   for(i=0;i<size;i++)
+      std::memcpy(names[i].name,names[rank].name,len+1);
+
+   MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
+      (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
+               MPI_COMM_WORLD);
+
+   int nodeRank=0;
+   for(i=0;i<rank;i++)
+   {
+      if(std::strcmp(names[rank].name,names[i].name)==0)
+         nodeRank++;
+   }
+
+   const int gpuNumber = nodeRank % gpuCount;
+
+   cudaSetDevice(gpuNumber);
+   TNL_CHECK_CUDA_DEVICE;
+
+   //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
+#endif
+#endif
+}
+
+} // namespace <unnamed>
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
index 6030b976f..04647cb4a 100644
--- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
+++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
@@ -12,7 +12,6 @@
 
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Communicators/MPIPrint.h>
 
 namespace TNL {
 namespace Meshes {
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
index 60605c6eb..99f505bba 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -12,6 +12,7 @@
 
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/MPI/getDataType.h>
 
 namespace TNL {
 namespace Meshes {
@@ -19,7 +20,7 @@ namespace DistributedMeshes {
 
 
 /*
- * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. 
+ * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap.
  * It is slow and has high RAM consumption
  */
 template< typename MeshFunction,
@@ -88,8 +89,8 @@ class DistributedGridIO<
          return true;
 
       };
-            
-    static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+
+    static bool load(const String& fileName,MeshFunctionType &meshFunction)
     {
         auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
         if(distrGrid==NULL) //not distributed
@@ -99,10 +100,10 @@ class DistributedGridIO<
         }
 
         const MeshType& mesh=meshFunction.getMesh();
-        
+
         PointType spaceSteps=mesh.getSpaceSteps();
         PointType origin=mesh.getOrigin();
-                
+
         CoordinatesType localSize=distrGrid->getLocalSize();
         CoordinatesType localBegin=distrGrid->getLocalBegin();
 
@@ -111,33 +112,33 @@ class DistributedGridIO<
         newMesh->setSpaceSteps(spaceSteps);
         CoordinatesType newOrigin;
         newMesh->setOrigin(origin+spaceSteps*localBegin);
-        
+
         VectorType newDof(newMesh-> template getEntitiesCount< typename MeshType::Cell >());
         MeshFunctionType newMeshFunction;
-        newMeshFunction.bind(newMesh,newDof); 
+        newMeshFunction.bind(newMesh,newDof);
 
         CoordinatesType zeroCoord;
-        zeroCoord.setValue(0);        
+        zeroCoord.setValue(0);
 
         File file;
         file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), std::ios_base::in );
         newMeshFunction.boundLoad(file);
         file.close();
         CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
-        
+
         return true;
     };
-    
+
 };
 
 /*
- * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, 
+ * Save distributed data into single file without overlaps using MPIIO and MPI datatypes,
  * EXPLOSIVE: works with only Grids and MPI
  * BAD IMPLEMENTTION creating MPI-Types at every save! -- I dont want contamine more places by MPI..
  */
 
 #ifdef HAVE_MPI
-template<typename MeshFunctionType> 
+template<typename MeshFunctionType>
 class DistributedGridIO_MPIIOBase
 {
    public:
@@ -152,7 +153,7 @@ class DistributedGridIO_MPIIOBase
     static bool save(const String& fileName, MeshFunctionType &meshFunction, RealType *data)
     {
 		auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
-        
+
         if(distrGrid==NULL) //not distributed
         {
             meshFunction.save(fileName);
@@ -168,7 +169,7 @@ class DistributedGridIO_MPIIOBase
                       &file);
       if( ok != 0 )
          throw std::runtime_error("Open file falied");
-      
+
 		int written=save(file,meshFunction, data,0);
 
         MPI_File_close(&file);
@@ -176,7 +177,7 @@ class DistributedGridIO_MPIIOBase
 		return written>0;
 
 	};
-    
+
     static int save(MPI_File &file, MeshFunctionType &meshFunction, RealType *data, int offset)
     {
 
@@ -187,7 +188,7 @@ class DistributedGridIO_MPIIOBase
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
 
        int headerSize;
-	   
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
        if(Communicators::MpiCommunicator::GetRank(group)==0)
@@ -200,9 +201,9 @@ class DistributedGridIO_MPIIOBase
 	   offset +=headerSize;
 
        MPI_File_set_view(file,offset,
-               Communicators::MPITypeResolver<RealType>::getType(),
+               TNL::MPI::getDataType<RealType>(),
                ftype,"native",MPI_INFO_NULL);
-       
+
        MPI_Status wstatus;
 
        MPI_File_write(file,data,1,atype,&wstatus);
@@ -222,7 +223,7 @@ class DistributedGridIO_MPIIOBase
         int fstarts[dim];
         int flsize[dim];
         int fgsize[dim];
-        
+
         hackArray(dim,fstarts,distrGrid->getGlobalBegin().getData());
         hackArray(dim,flsize,distrGrid->getLocalSize().getData());
         hackArray(dim,fgsize,distrGrid->getGlobalSize().getData());
@@ -230,14 +231,14 @@ class DistributedGridIO_MPIIOBase
         MPI_Type_create_subarray(dim,
             fgsize,flsize,fstarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype);
 
         MPI_Type_commit(ftype);
 
        int agsize[dim];
        int alsize[dim];
-       int astarts[dim]; 
+       int astarts[dim];
 
        hackArray(dim,astarts,distrGrid->getLocalBegin().getData());
        hackArray(dim,alsize,distrGrid->getLocalSize().getData());
@@ -246,7 +247,7 @@ class DistributedGridIO_MPIIOBase
        MPI_Type_create_subarray(dim,
             agsize,alsize,astarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             atype);
        MPI_Type_commit(atype);
 
@@ -350,9 +351,9 @@ class DistributedGridIO_MPIIOBase
       MPI_File_close(&file);
       return ret;
    }
-            
+
     /* Funky bomb - no checks - only dirty load */
-    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) 
+    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset )
     {
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
 
@@ -360,7 +361,7 @@ class DistributedGridIO_MPIIOBase
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
-       
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
        int headerSize=0;
@@ -371,18 +372,18 @@ class DistributedGridIO_MPIIOBase
             headerSize=readMeshFunctionHeader(file,meshFunction,dataCount);
        }
        MPI_Bcast(&headerSize, 1, MPI_INT,0, group);
-       
+
        if(headerSize<0)
             return false;
 
        offset+=headerSize;
 
        MPI_File_set_view(file,offset,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype,"native",MPI_INFO_NULL);
        MPI_Status wstatus;
        MPI_File_read(file,(void*)data,1,atype,&wstatus);
-        
+
        MPI_Type_free(&atype);
        MPI_Type_free(&ftype);
 
@@ -412,7 +413,7 @@ class DistributedGridIO_MPIIOBase
         size+=count*sizeof(char);
         MPI_File_read(file, (void *)&count,1, MPI_INT, &rstatus);//DATACOUNT
         size+=1*sizeof(int);
-        
+
         if(count!=length)
         {
             std::cerr<<"Chyba načítání MeshFunction, délka dat v souboru neodpovídá očekávané délce" << std::endl;
@@ -421,7 +422,7 @@ class DistributedGridIO_MPIIOBase
 
         return size;
     };
-    
+
 };
 #endif
 
@@ -444,10 +445,10 @@ class DistributedGridIO<
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector=meshFunction.getData();
-            typename MeshFunctionType::RealType * data=hostVector.getData();  
+            typename MeshFunctionType::RealType * data=hostVector.getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
@@ -455,12 +456,12 @@ class DistributedGridIO<
          return false;
       };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector.setLike(meshFunction.getData());
             auto* data=hostVector.getData();
@@ -501,7 +502,7 @@ class DistributedGridIO<
          return false;
     };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
index 5a1150240..7bc17f920 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
@@ -16,7 +16,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h>
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Pointers/SharedPointer.h>
 
 namespace TNL {
diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h
index 9182c620f..5c35c7c33 100644
--- a/src/TNL/Solvers/Solver_impl.h
+++ b/src/TNL/Solvers/Solver_impl.h
@@ -16,11 +16,11 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 namespace TNL {
 namespace Solvers {
-   
+
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename MeshConfig, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
           template< typename MeshConfig > class ProblemConfig,
           typename MeshConfig >
@@ -37,7 +37,7 @@ run( int argc, char* argv[] )
    Devices::Cuda::configSetup( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv );
+   TNL::MPI::ScopedInitializer mpi( argc, argv );
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return false;
diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp
index c33ae8294..a2d4f48e9 100644
--- a/src/Tools/tnl-game-of-life.cpp
+++ b/src/Tools/tnl-game-of-life.cpp
@@ -18,7 +18,7 @@
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 using namespace TNL;
 
@@ -361,7 +361,7 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index 1a7769b5c..73765aafb 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -16,7 +16,7 @@
 #include <TNL/Meshes/Grid.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 
 using namespace TNL;
@@ -55,7 +55,7 @@ int main( int argc, char* argv[] )
    setupConfig( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h
index 0be53242b..1b8c59c75 100644
--- a/src/Tools/tnl-test-distributed-mesh.h
+++ b/src/Tools/tnl-test-distributed-mesh.h
@@ -19,7 +19,7 @@
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 using namespace TNL;
 
@@ -431,7 +431,7 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
index 113d1daa3..366535cc7 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
index 145b0db5b..aba9420f0 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index d80e467f5..3c637de4d 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
index a072b2e80..93d6c3036 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index 7decaf575..b778937b6 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -18,7 +18,6 @@
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Meshes/Readers/PVTUReader.h>
diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h
index 0f8f4b059..4c89b60ba 100644
--- a/src/UnitTests/main_mpi.h
+++ b/src/UnitTests/main_mpi.h
@@ -7,7 +7,7 @@
 
 #if (defined(HAVE_GTEST) && defined(HAVE_MPI))
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 using CommunicatorType = TNL::Communicators::MpiCommunicator;
 
 #include <sstream>
@@ -58,7 +58,7 @@ int main( int argc, char* argv[] )
       delete listeners.Release(listeners.default_result_printer());
       listeners.Append(new MinimalistBufferedPrinter);
 
-      TNL::Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
+      TNL::MPI::ScopedInitializer mpi(argc, argv);
    #endif
    return RUN_ALL_TESTS();
 #else
-- 
GitLab


From 5375835239bf201a79e637d767e4020ff344eb46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 29 Dec 2020 22:38:03 +0100
Subject: [PATCH 36/50] MPI: added getRankOnNode and removed
 MPI_Get_processor_name from selectGPU

---
 Documentation/Pages/main-page.md |  6 ++---
 src/TNL/MPI/Utils.h              | 30 +++++++++++++++++++++
 src/TNL/MPI/Wrappers.h           |  7 ++++-
 src/TNL/MPI/selectGPU.h          | 45 ++++----------------------------
 4 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/Documentation/Pages/main-page.md b/Documentation/Pages/main-page.md
index db9aceccb..5693f92a0 100644
--- a/Documentation/Pages/main-page.md
+++ b/Documentation/Pages/main-page.md
@@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries.
     - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for
       computations on Nvidia GPUs.
     - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can
-      use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for
-      distributed computing. For distributed CUDA computations, the library must
-      be [CUDA-aware](
+      use a library implementing the MPI-3 standard for distributed computing
+      (e.g. [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA
+      computations, the library must be [CUDA-aware](
       https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).
 
 - __Libraries:__
diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h
index b655aefd0..d334aaf5b 100644
--- a/src/TNL/MPI/Utils.h
+++ b/src/TNL/MPI/Utils.h
@@ -42,5 +42,35 @@ inline void restoreRedirection()
    }
 }
 
+/**
+ * \brief Returns a local rank ID of the current process within a group of
+ * processes running on a shared-memory node.
+ *
+ * The given MPI communicator is split into groups according to the
+ * `MPI_COMM_TYPE_SHARED` type (from MPI-3) and the rank ID of the process
+ * within the group is returned.
+ */
+inline int getRankOnNode( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   const int rank = GetRank(group);
+
+   MPI_Info info;
+   MPI_Info_create( &info );
+
+   MPI_Comm local_comm;
+   MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, rank, info, &local_comm );
+
+   const int local_rank = GetRank( local_comm );
+
+   MPI_Comm_free(&local_comm);
+   MPI_Info_free(&info);
+
+   return local_rank;
+#else
+   return 0;
+#endif
+}
+
 } // namespace MPI
 } // namespace TNL
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
index 9a057da5f..5527ad9af 100644
--- a/src/TNL/MPI/Wrappers.h
+++ b/src/TNL/MPI/Wrappers.h
@@ -22,11 +22,13 @@
 
 #include <TNL/Assert.h>
 #include "getDataType.h"
-#include "selectGPU.h"
 
 namespace TNL {
 namespace MPI {
 
+// forward declaration to break cyclic inclusion
+inline void selectGPU();
+
 // function wrappers for MPI constants
 
 inline MPI_Comm AllGroup()
@@ -345,3 +347,6 @@ void Alltoall( const T* sendData,
 
 } // namespace MPI
 } // namespace TNL
+
+// late inclusion to break cyclic inclusion
+#include "selectGPU.h"
diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h
index def9a329f..781a52809 100644
--- a/src/TNL/MPI/selectGPU.h
+++ b/src/TNL/MPI/selectGPU.h
@@ -10,63 +10,28 @@
 
 #pragma once
 
-#include <cstring>
-
 #include <TNL/Cuda/CheckDevice.h>
 
+#include "Utils.h"
+
 namespace TNL {
 namespace MPI {
-namespace {
-
-#ifdef HAVE_MPI
-#ifdef HAVE_CUDA
-   typedef struct __attribute__((__packed__)) {
-      char name[MPI_MAX_PROCESSOR_NAME];
-   } procName;
-#endif
-#endif
 
 inline void selectGPU()
 {
 #ifdef HAVE_MPI
 #ifdef HAVE_CUDA
-   int size;
-   MPI_Comm_size( MPI_COMM_WORLD, &size );
-   int rank;
-   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    int gpuCount;
-   cudaGetDeviceCount( &gpuCount );
+   cudaGetDeviceCount(&gpuCount);
 
-   procName names[size];
-
-   int i=0;
-   int len;
-   MPI_Get_processor_name(names[rank].name, &len);
-
-   for(i=0;i<size;i++)
-      std::memcpy(names[i].name,names[rank].name,len+1);
-
-   MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-      (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-               MPI_COMM_WORLD);
-
-   int nodeRank=0;
-   for(i=0;i<rank;i++)
-   {
-      if(std::strcmp(names[rank].name,names[i].name)==0)
-         nodeRank++;
-   }
-
-   const int gpuNumber = nodeRank % gpuCount;
+   const int local_rank = getRankOnNode();
+   const int gpuNumber = local_rank % gpuCount;
 
    cudaSetDevice(gpuNumber);
    TNL_CHECK_CUDA_DEVICE;
-
-   //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
 #endif
 #endif
 }
 
-} // namespace <unnamed>
 } // namespace MPI
 } // namespace TNL
-- 
GitLab


From 3ef7f564fd8bba50d94f9f0fd12d55bcbac947c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 31 Dec 2020 23:46:38 +0100
Subject: [PATCH 37/50] MPI refactoring: removed writeProlog from
 MpiCommunicator

---
 src/TNL/Communicators/MpiCommunicator.h       |  9 -------
 src/TNL/Solvers/PDE/PDESolver.h               | 25 +++++++++----------
 src/TNL/Solvers/PDE/PDESolver_impl.h          | 19 +++++++-------
 .../Solvers/PDE/TimeDependentPDESolver_impl.h |  2 +-
 .../PDE/TimeIndependentPDESolver_impl.h       | 18 ++++++-------
 5 files changed, 32 insertions(+), 41 deletions(-)

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 1995978c5..eaf6ca634 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -278,15 +278,6 @@ class MpiCommunicator
          MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
-
-      static void writeProlog( Logger& logger )
-      {
-         if( isDistributed() )
-         {
-            logger.writeParameter( "MPI processes:", GetSize(AllGroup) );
-         }
-      }
-
       static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup )
       {
 #ifdef HAVE_MPI
diff --git a/src/TNL/Solvers/PDE/PDESolver.h b/src/TNL/Solvers/PDE/PDESolver.h
index b9bbcd5e2..70f19d8de 100644
--- a/src/TNL/Solvers/PDE/PDESolver.h
+++ b/src/TNL/Solvers/PDE/PDESolver.h
@@ -18,8 +18,8 @@
 
 namespace TNL {
 namespace Solvers {
-namespace PDE { 
-   
+namespace PDE {
+
 template< typename Real,
           typename Index >
 class PDESolver
@@ -28,8 +28,8 @@ class PDESolver
       using RealType = Real;
       using IndexType = Index;
       using SolverMonitorType = IterativeSolverMonitor< RealType, IndexType >;
-      
-      
+
+
       PDESolver();
 
       static void configSetup( Config::ConfigDescription& config,
@@ -38,29 +38,28 @@ class PDESolver
       bool setup( const Config::ParameterContainer& parameters,
                   const String& prefix = "" );
 
-      template< typename Communicator >
       bool writeProlog( Logger& logger,
                         const Config::ParameterContainer& parameters );
-      
+
       void setIoTimer( Timer& ioTimer);
 
       void setComputeTimer( Timer& computeTimer );
-      
+
       void setTotalTimer( Timer& totalTimer );
-      
+
       void setSolverMonitor( SolverMonitorType& solverMonitor );
-      
+
       SolverMonitorType& getSolverMonitor();
 
-      bool writeEpilog( Logger& logger ) const;      
-      
+      bool writeEpilog( Logger& logger ) const;
+
    protected:
 
       Timer *ioTimer, *computeTimer, *totalTimer;
-      
+
       SolverMonitorType *solverMonitorPointer;
 };
- 
+
 } // namespace PDE
 } // namespace Solvers
 } // namespace TNL
diff --git a/src/TNL/Solvers/PDE/PDESolver_impl.h b/src/TNL/Solvers/PDE/PDESolver_impl.h
index 37ade9f38..8bdcbd86a 100644
--- a/src/TNL/Solvers/PDE/PDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/PDESolver_impl.h
@@ -11,21 +11,22 @@
 #pragma once
 
 #include <TNL/Solvers/PDE/PDESolver.h>
+#include <TNL/MPI/Utils.h>
 
 namespace TNL {
 namespace Solvers {
-namespace PDE { 
+namespace PDE {
 
 template< typename Real,
-          typename Index >   
-PDESolver< Real, Index >::PDESolver()   
+          typename Index >
+PDESolver< Real, Index >::PDESolver()
 : ioTimer( 0 ),
   computeTimer( 0 ),
   totalTimer( 0 ),
   solverMonitorPointer( 0 )
 {
 }
-   
+
 template< typename Real,
           typename Index >
 void
@@ -65,7 +66,6 @@ getSolverMonitor()
 
 template< typename Real,
           typename Index >
-   template< typename Communicator >
 bool
 PDESolver< Real, Index >::
 writeProlog( Logger& logger,
@@ -84,7 +84,8 @@ writeProlog( Logger& logger,
       else
          logger.writeParameter< String >( "OMP enabled:", "no", 1 );
    }
-   Communicator::writeProlog( logger );
+   if( MPI::isInitialized() )
+      logger.writeParameter( "MPI processes:", MPI::GetSize() );
    logger.writeSeparator();
    const bool printGPUs = parameters.getParameter< String >( "device" ) == "cuda";
    logger.writeSystemInformation( printGPUs );
@@ -116,9 +117,9 @@ void PDESolver< Real, Index >::
 setTotalTimer( Timer& totalTimer )
 {
    this->totalTimer = &totalTimer;
-}  
-   
+}
+
 } // namespace PDE
 } // namespace Solvers
 } // namespace TNL
-   
+
diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
index 46ffa6fea..0c605fb95 100644
--- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
@@ -165,7 +165,7 @@ writeProlog( Logger& logger,
    logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters );
    logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters );
    logger.writeSeparator();
-   return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters );
+   return BaseType::writeProlog( logger, parameters );
 }
 
 template< typename Problem,
diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
index 455682e2b..5292e7f41 100644
--- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
@@ -15,7 +15,7 @@
  *                                                                         *
  ***************************************************************************/
 
-#pragma once 
+#pragma once
 
 #include <TNL/Solvers/PDE/TimeIndependentPDESolver.h>
 #include <TNL/Meshes/TypeResolver/TypeResolver.h>
@@ -23,7 +23,7 @@
 
 namespace TNL {
 namespace Solvers {
-namespace PDE {   
+namespace PDE {
 
 
 template< typename Problem >
@@ -75,7 +75,7 @@ setup( const Config::ParameterContainer& parameters,
       return false;
    }
    problem->setCommonData( this->commonDataPointer );
-   
+
    /****
     * Setup the problem
     */
@@ -83,7 +83,7 @@ setup( const Config::ParameterContainer& parameters,
    {
       std::cerr << "The problem initiation failed!" << std::endl;
       return false;
-   }   
+   }
 
    /****
     * Set DOFs (degrees of freedom)
@@ -91,9 +91,9 @@ setup( const Config::ParameterContainer& parameters,
    TNL_ASSERT_GT( problem->getDofs(), 0, "number of DOFs must be positive" );
    this->dofs->setSize( problem->getDofs() );
    this->dofs->setValue( 0.0 );
-   this->problem->bindDofs( this->dofs );   
-   
-   
+   this->problem->bindDofs( this->dofs );
+
+
    /***
     * Set-up the initial condition
     */
@@ -102,7 +102,7 @@ setup( const Config::ParameterContainer& parameters,
    if( ! this->problem->setInitialCondition( parameters, this->dofs ) )
       return false;
    std::cout << " [ OK ]" << std::endl;
-   
+
    return true;
 }
 
@@ -128,7 +128,7 @@ writeProlog( Logger& logger,
    logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters );
    logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters );
    logger.writeSeparator();
-   return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters );
+   return BaseType::writeProlog( logger, parameters );
 }
 
 template< typename Problem >
-- 
GitLab


From b8ae1e278a68719bcc273fe6a68920aa176422e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 1 Jan 2021 00:04:07 +0100
Subject: [PATCH 38/50] MPI refactoring: moved setup and configSetup from
 MpiCommunicator into a separate header

---
 src/TNL/Communicators/MpiCommunicator.h | 135 ++++++------------------
 src/TNL/MPI.h                           |   1 +
 src/TNL/MPI/Config.h                    | 103 ++++++++++++++++++
 3 files changed, 134 insertions(+), 105 deletions(-)
 create mode 100644 src/TNL/MPI/Config.h

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index eaf6ca634..d3b0401e5 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -10,26 +10,12 @@
 
 #pragma once
 
-#include <iostream>
-
-#ifdef HAVE_MPI
-#ifdef OMPI_MAJOR_VERSION
-   // header specific to OpenMPI (needed for CUDA-aware detection)
-   #include <mpi-ext.h>
-#endif
-
-#include <unistd.h>  // getpid
-#endif
-
-#include <TNL/String.h>
-#include <TNL/Logger.h>
 #include <TNL/MPI/Wrappers.h>
 #include <TNL/MPI/DummyDefs.h>
 #include <TNL/MPI/Utils.h>
-#include <TNL/Config/ConfigDescription.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Exceptions/MPIDimsCreateError.h>
 
-
 namespace TNL {
 //! \brief Namespace for TNL communicators.
 namespace Communicators {
@@ -58,75 +44,13 @@ class MpiCommunicator
 
       static void configSetup( Config::ConfigDescription& config, const String& prefix = "" )
       {
-#ifdef HAVE_MPI
-         config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true );
-         config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." );
-         config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false );
-         config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 );
-#endif
+         MPI::configSetup( config, prefix );
       }
 
       static bool setup( const Config::ParameterContainer& parameters,
                          const String& prefix = "" )
       {
-#ifdef HAVE_MPI
-         if(IsInitialized())//i.e. - isUsed
-         {
-            const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
-            const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
-            if( redirect )
-               MPI::setupRedirection( outputDirectory );
-#ifdef HAVE_CUDA
-            int size;
-            MPI_Comm_size( MPI_COMM_WORLD, &size );
-            if( size > 1 )
-            {
-   #if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT
-               std::cout << "CUDA-aware MPI detected on this system ... " << std::endl;
-   #elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT
-               std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl;
-               return false;
-   #else
-               std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl;
-   #endif
-            }
-#endif // HAVE_CUDA
-            bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" );
-            int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" );
-
-            if( gdbDebug )
-            {
-               int rank = GetRank( MPI_COMM_WORLD );
-               int pid = getpid();
-
-               volatile int tnlMPIDebugAttached = 0;
-               MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD );
-               MPI_Barrier( MPI_COMM_WORLD );
-               if( rank == 0 )
-               {
-                  std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl;
-                  for( int i = 0; i < GetSize( MPI_COMM_WORLD ); i++ )
-                  {
-                     MPI_Status status;
-                     int recvPid;
-                     MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status );
-
-                     if( i == processToAttach || processToAttach == -1 )
-                     {
-                        std::cout << "  For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\""
-                                  << " -ex \"set variable tnlMPIDebugAttached=1\""
-                                  << " -ex \"continue\"" << std::endl;
-                     }
-                  }
-                  std::cout << std::flush;
-               }
-               if( rank == processToAttach || processToAttach == -1 )
-                  while( ! tnlMPIDebugAttached );
-               MPI_Barrier( MPI_COMM_WORLD );
-            }
-         }
-#endif // HAVE_MPI
-         return true;
+         return MPI::setup( parameters, prefix );
       }
 
       static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
@@ -157,32 +81,6 @@ class MpiCommunicator
          return MPI::GetSize( group );
       }
 
-      //dim-number of dimensions, distr array of guess distr - 0 for computation
-      //distr array will be filled by computed distribution
-      //more information in MPI documentation
-      static void DimsCreate(int nproc, int dim, int *distr)
-      {
-#ifdef HAVE_MPI
-         int sum = 0, prod = 1;
-         for( int i = 0;i < dim; i++ ) {
-            sum += distr[ i ];
-            prod *= distr[ i ];
-         }
-         if( prod != 0 && prod != GetSize( AllGroup ) )
-            throw Exceptions::MPIDimsCreateError();
-         if(sum==0) {
-            for(int i=0;i<dim-1;i++)
-               distr[i]=1;
-            distr[dim-1]=0;
-         }
-
-         MPI_Dims_create(nproc, dim, distr);
-#else
-         for(int i=0;i<dim;i++)
-            distr[i]=1;
-#endif
-      }
-
       static void Barrier( CommunicationGroup group = AllGroup )
       {
          MPI::Barrier( group );
@@ -278,6 +176,33 @@ class MpiCommunicator
          MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
+
+      //dim-number of dimensions, distr array of guess distr - 0 for computation
+      //distr array will be filled by computed distribution
+      //more information in MPI documentation
+      static void DimsCreate(int nproc, int dim, int *distr)
+      {
+#ifdef HAVE_MPI
+         int sum = 0, prod = 1;
+         for( int i = 0;i < dim; i++ ) {
+            sum += distr[ i ];
+            prod *= distr[ i ];
+         }
+         if( prod != 0 && prod != GetSize( AllGroup ) )
+            throw Exceptions::MPIDimsCreateError();
+         if(sum==0) {
+            for(int i=0;i<dim-1;i++)
+               distr[i]=1;
+            distr[dim-1]=0;
+         }
+
+         MPI_Dims_create(nproc, dim, distr);
+#else
+         for(int i=0;i<dim;i++)
+            distr[i]=1;
+#endif
+      }
+
       static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup )
       {
 #ifdef HAVE_MPI
diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h
index b1b7dd698..68e0dc48c 100644
--- a/src/TNL/MPI.h
+++ b/src/TNL/MPI.h
@@ -26,4 +26,5 @@
 #include "MPI/Wrappers.h"
 #include "MPI/Utils.h"
 #include "MPI/ScopedInitializer.h"
+#include "MPI/Config.h"
 #include "MPI/Print.h"
diff --git a/src/TNL/MPI/Config.h b/src/TNL/MPI/Config.h
new file mode 100644
index 000000000..d560b1d55
--- /dev/null
+++ b/src/TNL/MPI/Config.h
@@ -0,0 +1,103 @@
+/***************************************************************************
+                          MPI/Config.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+
+#ifdef HAVE_MPI
+#ifdef OMPI_MAJOR_VERSION
+   // header specific to OpenMPI (needed for CUDA-aware detection)
+   #include <mpi-ext.h>
+#endif
+
+#include <unistd.h>  // getpid
+#endif
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+#include "Utils.h"
+
+namespace TNL {
+namespace MPI {
+
+inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" )  // Registers the MPI-related entries in `config`; nothing is added without HAVE_MPI.
+{  // NOTE(review): `prefix` is accepted but not applied to the entry names below.
+#ifdef HAVE_MPI
+   config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true );
+   config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." );
+   config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false );
+   config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 );
+#endif
+}
+
+inline bool setup( const Config::ParameterContainer& parameters,  // Applies the MPI-related parameters (output redirection, CUDA-aware check, GDB attach helper).
+                   const String& prefix = "" )  // NOTE(review): `prefix` is accepted but not applied to the parameter names below.
+{  // Returns false only when a multi-process CUDA build detects MPI that is not CUDA-aware; true otherwise.
+#ifdef HAVE_MPI
+   if( Initialized() && ! Finalized() )
+   {
+      const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
+      const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
+      if( redirect )
+         MPI::setupRedirection( outputDirectory );
+#ifdef HAVE_CUDA
+      if( GetSize() > 1 )
+      {
+#if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT
+         std::cout << "CUDA-aware MPI detected on this system ... " << std::endl;
+#elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT
+         std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl;
+         return false;
+#else
+         std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl;
+#endif
+      }
+#endif // HAVE_CUDA
+      bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" );
+      int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" );
+
+      if( gdbDebug )
+      {
+         int rank = GetRank( MPI_COMM_WORLD );
+         int pid = getpid();
+
+         volatile int tnlMPIDebugAttached = 0;  // set to 1 from the attached GDB session to leave the wait loop below
+         MPI_Request request;
+         MPI_Isend( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &request );  // non-blocking: rank 0 sends to itself before its receive is posted
+         if( rank == 0 )
+         {
+            std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl;
+            for( int i = 0; i < GetSize(); i++ )
+            {
+               int recvPid;
+               MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+
+               if( i == processToAttach || processToAttach == -1 )
+               {
+                  std::cout << "  For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\""
+                            << " -ex \"set variable tnlMPIDebugAttached=1\""
+                            << " -ex \"continue\"" << std::endl;
+               }
+            }
+            std::cout << std::flush;
+         }
+         MPI_Wait( &request, MPI_STATUS_IGNORE );  // complete the send before this rank may start spinning without MPI calls
+         if( rank == processToAttach || processToAttach == -1 )
+            while( ! tnlMPIDebugAttached );
+         MPI_Barrier( MPI_COMM_WORLD );
+      }
+   }
+#endif // HAVE_MPI
+   return true;
+}
+
+} // namespace MPI
+} // namespace TNL
-- 
GitLab


From 0742d2a2406f4e52d5217df4c0c3944f491ec282 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 1 Jan 2021 00:13:00 +0100
Subject: [PATCH 39/50] MPI refactoring: replaced MPIDimsCreateError with
 std::logic_error

---
 src/TNL/Communicators/MpiCommunicator.h |  5 +++--
 src/TNL/Exceptions/MPIDimsCreateError.h | 28 -------------------------
 2 files changed, 3 insertions(+), 30 deletions(-)
 delete mode 100644 src/TNL/Exceptions/MPIDimsCreateError.h

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index d3b0401e5..c155cabbe 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -14,7 +14,6 @@
 #include <TNL/MPI/DummyDefs.h>
 #include <TNL/MPI/Utils.h>
 #include <TNL/MPI/Config.h>
-#include <TNL/Exceptions/MPIDimsCreateError.h>
 
 namespace TNL {
 //! \brief Namespace for TNL communicators.
@@ -189,7 +188,9 @@ class MpiCommunicator
             prod *= distr[ i ];
          }
          if( prod != 0 && prod != GetSize( AllGroup ) )
-            throw Exceptions::MPIDimsCreateError();
+            throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions."
+                                    "Non of the dimensions is zero and product of all dimensions does "
+                                    "not fit with number of MPI processes." );
          if(sum==0) {
             for(int i=0;i<dim-1;i++)
                distr[i]=1;
diff --git a/src/TNL/Exceptions/MPIDimsCreateError.h b/src/TNL/Exceptions/MPIDimsCreateError.h
deleted file mode 100644
index 1cb1a8f2e..000000000
--- a/src/TNL/Exceptions/MPIDimsCreateError.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
-                          MPIDimsCreateError.h  -  description
-                             -------------------
-    begin                : Jan 30, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <stdexcept>
-
-namespace TNL {
-namespace Exceptions {
-
-struct MPIDimsCreateError
-   : public std::runtime_error
-{
-   MPIDimsCreateError()
-   : std::runtime_error( "The program tries to call MPI_Dims_create with wrong dimensions."
-                         "Non of the dimensions is zero and product of all dimensions does not fit with number of MPI processes." )
-   {}
-};
-
-} // namespace Exceptions
-} // namespace TNL
-- 
GitLab


From eb8b40dcbd55fd2304d02f55ffc71a0c674cb65e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 1 Jan 2021 11:31:33 +0100
Subject: [PATCH 40/50] MPI: added function for timing Allreduce operations

---
 src/TNL/MPI.h           |  1 +
 src/TNL/MPI/Profiling.h | 25 +++++++++++++++++++++++++
 src/TNL/MPI/Wrappers.h  |  5 +++++
 3 files changed, 31 insertions(+)
 create mode 100644 src/TNL/MPI/Profiling.h

diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h
index 68e0dc48c..a5f9145b5 100644
--- a/src/TNL/MPI.h
+++ b/src/TNL/MPI.h
@@ -22,6 +22,7 @@
 
 #include "MPI/DummyDefs.h"
 #include "MPI/getDataType.h"
+#include "MPI/Profiling.h"
 #include "MPI/selectGPU.h"
 #include "MPI/Wrappers.h"
 #include "MPI/Utils.h"
diff --git a/src/TNL/MPI/Profiling.h b/src/TNL/MPI/Profiling.h
new file mode 100644
index 000000000..d50427c16
--- /dev/null
+++ b/src/TNL/MPI/Profiling.h
@@ -0,0 +1,25 @@
+/***************************************************************************
+                          MPI/Profiling.h  -  description
+                             -------------------
+    begin                : Jan 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Timer.h>
+
+namespace TNL {
+namespace MPI {
+
+inline Timer& getTimerAllreduce()  // Returns a reference to the timer that the MPI::Allreduce wrappers start and stop around MPI_Allreduce.
+{
+   static Timer t;  // function-local static: a single instance, constructed on the first call
+   return t;
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
index 5527ad9af..39344a128 100644
--- a/src/TNL/MPI/Wrappers.h
+++ b/src/TNL/MPI/Wrappers.h
@@ -22,6 +22,7 @@
 
 #include <TNL/Assert.h>
 #include "getDataType.h"
+#include "Profiling.h"
 
 namespace TNL {
 namespace MPI {
@@ -278,7 +279,9 @@ void Allreduce( const T* data,
 {
 #ifdef HAVE_MPI
    TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   getTimerAllreduce().start();
    MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, group );
+   getTimerAllreduce().stop();
 #else
    std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );
 #endif
@@ -293,7 +296,9 @@ void Allreduce( T* data,
 {
 #ifdef HAVE_MPI
    TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   getTimerAllreduce().start();
    MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType<T>(), op, group );
+   getTimerAllreduce().stop();
 #endif
 }
 
-- 
GitLab


From 5e7005a67fd1f93b6d295e01c561f5bd1dae88f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 1 Jan 2021 15:04:06 +0100
Subject: [PATCH 41/50] MPI refactoring: removed MpiCommunicator from solvers:
 Merson, GMRES, Linear/Traits.h

---
 src/TNL/Solvers/Linear/GMRES.h      |  3 ---
 src/TNL/Solvers/Linear/GMRES_impl.h | 10 +++++-----
 src/TNL/Solvers/Linear/Traits.h     | 10 +++-------
 src/TNL/Solvers/ODE/Merson_impl.h   | 10 +++++-----
 4 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index e1c02f0ab..818f1c163 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -23,10 +23,7 @@ class GMRES
 : public LinearSolver< Matrix >
 {
    using Base = LinearSolver< Matrix >;
-
-   // compatibility shortcuts
    using Traits = Linear::Traits< Matrix >;
-   using CommunicatorType = typename Traits::CommunicatorType;
 
 public:
    using RealType = typename Base::RealType;
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 23b563940..3b13e0b28 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -510,7 +510,7 @@ hauseholder_generate( const int i,
       norm_yi_squared = 2 * (normz * normz + std::fabs( y_ii ) * normz);
    }
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // XXX: normalization is slower, but more stable
 //   y_i *= 1.0 / std::sqrt( norm_yi_squared );
@@ -534,7 +534,7 @@ hauseholder_generate( const int i,
                  i,
                  aux );
       // no-op if the problem is not distributed
-      CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
+      MPI::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
       // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
       for( int k = 0; k < i; k++ ) {
@@ -559,7 +559,7 @@ hauseholder_apply_trunc( HostView out,
    HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 );
    Algorithms::MultiDeviceMemoryOperations< Devices::Host, DeviceType >::copy( YL_i.getData(), Traits::getLocalView( y_i ).getData(), YL_i.getSize() );
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // NOTE: aux = t_i * (y_i, z) = 1  since  t_i = 2 / ||y_i||^2  and
    //       (y_i, z) = ||z_trunc||^2 + |z_i| ||z_trunc|| = ||y_i||^2 / 2
@@ -579,7 +579,7 @@ hauseholder_apply_trunc( HostView out,
    }
 
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 }
 
 template< typename Matrix >
@@ -634,7 +634,7 @@ hauseholder_cwy_transposed( VectorViewType z,
               i + 1,
               aux );
    // no-op if the problem is not distributed
-   Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 83313ed98..7a1879923 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -12,7 +12,7 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -26,8 +26,6 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
-   using CommunicatorType = Communicators::MpiCommunicator;
-
    using VectorType = Containers::Vector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
@@ -51,7 +49,7 @@ struct Traits
    static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v; }
    static LocalViewType getLocalView( VectorViewType v ) { return v; }
 
-   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; }
+   static MPI_Comm getCommunicationGroup( const Matrix& m ) { return MPI::AllGroup(); }
    static void startSynchronization( VectorViewType v ) {}
    static void waitForSynchronization( VectorViewType v ) {}
 };
@@ -59,8 +57,6 @@ struct Traits
 template< typename Matrix, typename Communicator >
 struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
 {
-   using CommunicatorType = Communicator;
-
    using VectorType = Containers::DistributedVector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
@@ -96,7 +92,7 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
    static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); }
    static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); }
 
-   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
+   static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
    static void startSynchronization( VectorViewType v ) { v.startSynchronization(); }
    static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); }
 };
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 82a6a87ff..247318f33 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -13,13 +13,13 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 
 #include "Merson.h"
 
 namespace TNL {
 namespace Solvers {
-namespace ODE {   
+namespace ODE {
 
 /****
  * In this code we do not use constants and references as we would like to.
@@ -154,9 +154,9 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u )
       RealType error( 0.0 );
       if( adaptivity != 0.0 )
       {
-         const RealType localError = 
+         const RealType localError =
             max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) );
-            Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+            MPI::Allreduce( &localError, &error, 1, MPI_MAX, MPI::AllGroup() );
       }
 
       if( adaptivity == 0.0 || error < adaptivity )
@@ -185,7 +185,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u )
          currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
          TNLMPI::Bcast( currentTau, 1, 0 );
-#endif        
+#endif
       }
       if( time + currentTau > this->getStopTime() )
          currentTau = this->getStopTime() - time; //we don't want to keep such tau
-- 
GitLab


From 3c5d17e38a10f8a4b306d16c73c462416a39e604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 1 Jan 2021 21:55:37 +0100
Subject: [PATCH 42/50] MPI refactoring: removed MpiCommunicator from
 DistributedNDArray, added Allocator parameter to NDArray

---
 src/TNL/Containers/DistributedNDArray.h       | 44 ++++++-----
 .../DistributedNDArraySynchronizer.h          | 62 +++++++--------
 src/TNL/Containers/DistributedNDArrayView.h   | 21 +++--
 src/TNL/Containers/NDArray.h                  | 78 ++++++++++++++++---
 src/TNL/Containers/Partitioner.h              |  2 +-
 .../DistributedNDArrayOverlaps_1D_test.h      | 16 ++--
 .../DistributedNDArrayOverlaps_semi1D_test.h  | 16 ++--
 .../ndarray/DistributedNDArray_1D_test.h      | 20 ++---
 .../ndarray/DistributedNDArray_semi1D_test.h  | 20 ++---
 9 files changed, 157 insertions(+), 122 deletions(-)

diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h
index 57b94a34b..c49e9e31b 100644
--- a/src/TNL/Containers/DistributedNDArray.h
+++ b/src/TNL/Containers/DistributedNDArray.h
@@ -12,34 +12,30 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/NDArray.h>
-#include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename NDArray,
-          typename Communicator = Communicators::MpiCommunicator,
           typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > >
 class DistributedNDArray
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = typename NDArray::ValueType;
    using DeviceType = typename NDArray::DeviceType;
    using IndexType = typename NDArray::IndexType;
+   using AllocatorType = typename NDArray::AllocatorType;
    using SizesHolderType = typename NDArray::SizesHolderType;
    using PermutationType = typename NDArray::PermutationType;
-   using CommunicatorType = Communicator;
    using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >;
    using LocalRangeType = Subrange< IndexType >;
    using OverlapsType = Overlaps;
    using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >;
 
-   using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >;
-   using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >;
+   using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Overlaps >;
+   using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Overlaps >;
    using LocalViewType = typename NDArray::ViewType;
    using ConstLocalViewType = typename NDArray::ConstViewType;
 
@@ -49,10 +45,17 @@ public:
 
    DistributedNDArray() = default;
 
-   // The copy-constructor of TNL::Containers::Array makes shallow copy so our
-   // copy-constructor cannot be default. Actually, we most likely don't need
-   // it anyway, so let's just delete it.
-   DistributedNDArray( const DistributedNDArray& ) = delete;
+   DistributedNDArray( const AllocatorType& allocator );
+
+   // Copy constructor (makes a deep copy).
+   explicit DistributedNDArray( const DistributedNDArray& ) = default;
+
+   // Copy constructor with a specific allocator (makes a deep copy).
+   explicit DistributedNDArray( const DistributedNDArray& other, const AllocatorType& allocator )
+   : localArray( allocator )  // the local array is constructed with the given allocator first
+   {
+      *this = other;  // the copy itself is delegated to the copy-assignment operator
+   }
 
    // Standard copy-semantics with deep copy, just like regular 1D array.
    // Mismatched sizes cause reallocations.
@@ -79,8 +82,13 @@ public:
       return NDArray::getDimension();
    }
 
+   AllocatorType getAllocator() const  // Returns, by value, the allocator reported by the local array.
+   {
+      return localArray.getAllocator();
+   }
    __cuda_callable__
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -232,8 +240,8 @@ public:
             localEnds == other.localEnds &&
             localArray == other.localArray;
       bool result = true;
-      if( group != CommunicatorType::NullGroup )
-         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      if( group != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
       return result;
    }
 
@@ -375,7 +383,7 @@ public:
    }
 
    template< std::size_t level >
-   void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup )
+   void setDistribution( IndexType begin, IndexType end, MPI_Comm group = MPI::AllGroup() )
    {
       static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." );
       TNL_ASSERT_GE( begin, 0, "begin must be non-negative" );
@@ -383,7 +391,7 @@ public:
       TNL_ASSERT_LT( begin, end, "begin must be lesser than end" );
       localBegins.template setSize< level >( begin );
       localEnds.template setSize< level >( end );
-      TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group,
+      TNL_ASSERT( this->group == MPI::NullGroup() || this->group == group,
                   std::cerr << "different groups cannot be combined for different dimensions" );
       this->group = group;
    }
@@ -408,7 +416,7 @@ public:
    void reset()
    {
       localArray.reset();
-      group = CommunicatorType::NullGroup;
+      group = MPI::NullGroup();
       globalSizes = SizesHolderType{};
       localBegins = LocalBeginsType{};
       localEnds = SizesHolderType{};
@@ -435,7 +443,7 @@ public:
 
 protected:
    NDArray localArray;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    SizesHolderType globalSizes;
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h
index bcec4a7b4..cea40bc21 100644
--- a/src/TNL/Containers/DistributedNDArraySynchronizer.h
+++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h
@@ -15,6 +15,7 @@
 #include <future>
 
 #include <TNL/Containers/ndarray/SynchronizerBuffers.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -69,7 +70,6 @@ public:
 
 protected:
    using DistributedNDArrayView = typename DistributedNDArray::ViewType;
-   using Communicator = typename DistributedNDArray::CommunicatorType;
    using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >;
 
    DistributedNDArrayView array_view;
@@ -88,12 +88,12 @@ protected:
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true );
 
       // issue all send and receive async operations
-      std::vector< typename Communicator::Request > requests;
-      const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup();
+      std::vector< MPI_Request > requests;
+      const MPI_Comm group = array_view.getCommunicationGroup();
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group );
 
       // wait until send is done
-      Communicator::WaitAll( requests.data(), requests.size() );
+      MPI::Waitall( requests.data(), requests.size() );
 
       // copy data from receive buffers
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false );
@@ -152,9 +152,9 @@ protected:
          dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
 
          // FIXME: set proper neighbor IDs !!!
-         const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup();
-         const int rank = Communicator::GetRank(group);
-         const int nproc = Communicator::GetSize(group);
+         const MPI_Comm group = array_view.getCommunicationGroup();
+         const int rank = MPI::GetRank(group);
+         const int nproc = MPI::GetSize(group);
          dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
          dim_buffers.right_neighbor = (rank + 1) % nproc;
       }
@@ -221,32 +221,32 @@ protected:
          auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
          if( LBM_HACK == false ) {
-            requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData(),
-                                                     dim_buffers.left_send_view.getStorageSize(),
-                                                     dim_buffers.left_neighbor, 0, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData(),
-                                                     dim_buffers.left_recv_view.getStorageSize(),
-                                                     dim_buffers.left_neighbor, 1, group ) );
-            requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData(),
-                                                     dim_buffers.right_send_view.getStorageSize(),
-                                                     dim_buffers.right_neighbor, 1, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData(),
-                                                     dim_buffers.right_recv_view.getStorageSize(),
-                                                     dim_buffers.right_neighbor, 0, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
+                                            dim_buffers.left_send_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, 0, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
+                                            dim_buffers.left_recv_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, 1, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
+                                            dim_buffers.right_send_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
+                                            dim_buffers.right_recv_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, 0, group ) );
          }
          else {
-            requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData() + 0,
-                                                     dim_buffers.left_send_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.left_neighbor, 0, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                                     dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.left_neighbor, 1, group ) );
-            requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                                     dim_buffers.right_send_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.right_neighbor, 1, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData() + 0,
-                                                     dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.right_neighbor, 0, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
+                                            dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.left_neighbor, 0, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                            dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.left_neighbor, 1, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                            dim_buffers.right_send_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.right_neighbor, 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
+                                            dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.right_neighbor, 0, group ) );
          }
       }
    };
diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h
index 102985e9c..4812bf5c0 100644
--- a/src/TNL/Containers/DistributedNDArrayView.h
+++ b/src/TNL/Containers/DistributedNDArrayView.h
@@ -12,33 +12,30 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/NDArrayView.h>
 #include <TNL/Containers/Subrange.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename NDArrayView,
-          typename Communicator = Communicators::MpiCommunicator,
           typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > >
 class DistributedNDArrayView
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = typename NDArrayView::ValueType;
    using DeviceType = typename NDArrayView::DeviceType;
    using IndexType = typename NDArrayView::IndexType;
    using SizesHolderType = typename NDArrayView::SizesHolderType;
    using PermutationType = typename NDArrayView::PermutationType;
-   using CommunicatorType = Communicator;
    using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >;
    using LocalRangeType = Subrange< IndexType >;
    using OverlapsType = Overlaps;
    using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >;
 
-   using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >;
-   using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >;
+   using ViewType = DistributedNDArrayView< NDArrayView, Overlaps >;
+   using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Overlaps >;
    using LocalViewType = NDArrayView;
    using ConstLocalViewType = typename NDArrayView::ConstViewType;
 
@@ -49,7 +46,7 @@ public:
 
    // explicit initialization by local array view, global sizes and local begins and ends
    __cuda_callable__
-   DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group )
+   DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, MPI_Comm group )
    : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {}
 
    // Copy-constructor does shallow copy, so views can be passed-by-value into
@@ -112,7 +109,7 @@ public:
    void reset()
    {
       localView.reset();
-      group = CommunicatorType::NullGroup;
+      group = MPI::NullGroup();
       globalSizes = SizesHolderType{};
       localBegins = LocalBeginsType{};
       localEnds = SizesHolderType{};
@@ -124,7 +121,7 @@ public:
    }
 
    __cuda_callable__
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -276,8 +273,8 @@ public:
             localEnds == other.localEnds &&
             localView == other.localView;
       bool result = true;
-      if( group != CommunicatorType::NullGroup )
-         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      if( group != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
       return result;
    }
 
@@ -406,7 +403,7 @@ public:
 
 protected:
    NDArrayView localView;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    SizesHolderType globalSizes;
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index 7b8a2f31c..f8ba157ba 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -59,10 +59,8 @@ public:
 
    NDArrayStorage() = default;
 
-   // The copy-constructor of TNL::Containers::Array makes shallow copy so our
-   // copy-constructor cannot be default. Actually, we most likely don't need
-   // it anyway, so let's just delete it.
-   NDArrayStorage( const NDArrayStorage& ) = delete;
+   // Copy constructor (makes a deep copy).
+   explicit NDArrayStorage( const NDArrayStorage& ) = default;
 
    // Standard copy-semantics with deep copy, just like regular 1D array.
    // Mismatched sizes cause reallocations.
@@ -326,21 +324,49 @@ template< typename Value,
           typename SizesHolder,
           typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >,  // identity by default
           typename Device = Devices::Host,
-          typename Index = typename SizesHolder::IndexType >
+          typename Index = typename SizesHolder::IndexType,
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class NDArray
-: public NDArrayStorage< Array< Value, Device, Index >,
+: public NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >
 {
-   using Base = NDArrayStorage< Array< Value, Device, Index >,
+   using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >;
 
 public:
-   // inherit all assignment operators
+   // inherit all constructors and assignment operators
+   using Base::Base;
    using Base::operator=;
+
+   // default constructor
+   NDArray() = default;
+
+   // implement dynamic array interface
+   using AllocatorType = Allocator;
+
+   NDArray( const AllocatorType& allocator )
+   {
+      // start from an empty storage array that carries the specified allocator
+      this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator );
+   }
+
+   // Copy constructor with a specific allocator (makes a deep copy).
+   explicit NDArray( const NDArray& other, const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+      // copy the data
+      *this = other;
+   }
+
+   AllocatorType getAllocator() const
+   {
+      return this->array.getAllocator();
+   }
 };
 
 template< typename Value,
@@ -372,21 +398,49 @@ template< typename Value,
           typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >,  // identity by default
           typename SliceInfo = SliceInfo<>,  // no slicing by default
           typename Device = Devices::Host,
-          typename Index = typename SizesHolder::IndexType >
+          typename Index = typename SizesHolder::IndexType,
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class SlicedNDArray
-: public NDArrayStorage< Array< Value, Device, Index >,
+: public NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::SlicedNDArrayBase< SliceInfo > >
 {
-   using Base = NDArrayStorage< Array< Value, Device, Index >,
+   using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::SlicedNDArrayBase< SliceInfo > >;
 
 public:
-   // inherit all assignment operators
+   // inherit all constructors and assignment operators
+   using Base::Base;
    using Base::operator=;
+
+   // default constructor
+   SlicedNDArray() = default;
+
+   // implement dynamic array interface
+   using AllocatorType = Allocator;
+
+   SlicedNDArray( const AllocatorType& allocator )
+   {
+      // start from an empty storage array that carries the specified allocator
+      this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator );
+   }
+
+   // Copy constructor with a specific allocator (makes a deep copy).
+   explicit SlicedNDArray( const SlicedNDArray& other, const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+      // copy the data
+      *this = other;
+   }
+
+   AllocatorType getAllocator() const
+   {
+      return this->array.getAllocator();
+   }
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h
index 32ba735e5..c2dce9e34 100644
--- a/src/TNL/Containers/Partitioner.h
+++ b/src/TNL/Containers/Partitioner.h
@@ -22,7 +22,7 @@
 namespace TNL {
 namespace Containers {
 
-template< typename Index, typename Communicator >
+template< typename Index, typename Communicator = Communicators::MpiCommunicator >
 class Partitioner
 {
    using CommunicationGroup = typename Communicator::CommunicationGroup;
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
index 366535cc7..36c4ea5b7 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
@@ -9,7 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
@@ -33,7 +32,6 @@ class DistributedNDArrayOverlaps_1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -44,17 +42,17 @@ protected:
    const int globalSize = 97;  // prime number to force non-uniform distribution
    const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} );
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArrayOverlaps_1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( globalSize );
       distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -70,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Host >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
 #ifdef HAVE_CUDA
    ,
@@ -78,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Cuda >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
 #endif
 >;
@@ -87,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 0 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize );
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
index aba9420f0..0b6838639 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
@@ -9,7 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
@@ -33,7 +32,6 @@ class DistributedNDArrayOverlaps_semi1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -44,17 +42,17 @@ protected:
    const int globalSize = 97;  // prime number to force non-uniform distribution
    const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} );
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArrayOverlaps_semi1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( 0, globalSize, globalSize / 2 );
       distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -70,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
                                 Devices::Host >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 0, 2, 0 > >
 #ifdef HAVE_CUDA
    ,
@@ -78,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
                                 Devices::Cuda >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 0, 2, 0 > >
 #endif
 >;
@@ -87,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayType
 
 TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 1 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize );
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index 3c637de4d..e55192971 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -9,7 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
@@ -32,7 +31,6 @@ class DistributedNDArray_1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -42,17 +40,17 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArray_1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( globalSize );
       distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -67,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types<
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
-                                Devices::Host >,
-                       Communicators::MpiCommunicator >
+                                Devices::Host > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
-                                Devices::Cuda >,
-                       Communicators::MpiCommunicator >
+                                Devices::Cuda > >
 #endif
 >;
 
@@ -83,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 0 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize );
 }
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
index 93d6c3036..e3cbb3223 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
@@ -9,7 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
@@ -32,7 +31,6 @@ class DistributedNDArray_semi1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -42,17 +40,17 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArray_semi1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( 0, globalSize, globalSize / 2 );
       distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -67,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types<
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y, Z
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
-                                Devices::Host >,
-                       Communicators::MpiCommunicator >
+                                Devices::Host > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y, Z
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
-                                Devices::Cuda >,
-                       Communicators::MpiCommunicator >
+                                Devices::Cuda > >
 #endif
 >;
 
@@ -83,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 1 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize );
 }
-- 
GitLab


From ee2fd25dc248f65751ec4936a8227c67be45be4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 10:11:55 +0100
Subject: [PATCH 43/50] MPI refactoring: removed MpiCommunicator from algebraic
 data structures

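This affects DistributedArray, DistributedArrayView, DistributedVector,
DistributedVectorView and DistributedMatrix: the Communicator template
parameter is removed, communication groups are passed as plain MPI_Comm
and all communication goes through the TNL::MPI wrappers. Allocator
support was added to DistributedArray and DistributedVector
(DistributedArray gains an Allocator template parameter and
allocator-aware constructors). Also updated all benchmarks and unit tests.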
---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |  30 ++-
 src/Benchmarks/LinearSolvers/benchmarks.h     |   6 +-
 .../tnl-benchmark-linear-solvers.h            |  24 +-
 src/Benchmarks/ODESolvers/Euler.hpp           |   2 +-
 src/Benchmarks/ODESolvers/Merson.hpp          |  10 +-
 .../ODESolvers/tnl-benchmark-ode-solvers.h    |  16 +-
 src/TNL/Algorithms/DistributedScan.h          |  10 +-
 src/TNL/Containers/DistributedArray.h         |  49 +++--
 src/TNL/Containers/DistributedArray.hpp       | 206 +++++++++--------
 src/TNL/Containers/DistributedArrayView.h     |  26 +--
 src/TNL/Containers/DistributedArrayView.hpp   | 208 ++++++++----------
 src/TNL/Containers/DistributedVector.h        |  26 ++-
 src/TNL/Containers/DistributedVector.hpp      | 118 +++++-----
 src/TNL/Containers/DistributedVectorView.h    |  24 +-
 src/TNL/Containers/DistributedVectorView.hpp  | 137 +++++-------
 .../Expressions/DistributedComparison.h       |  62 +++---
 .../DistributedExpressionTemplates.h          |  16 +-
 .../DistributedVerticalOperations.h           |  60 +++--
 src/TNL/Containers/Partitioner.h              |  31 ++-
 src/TNL/Matrices/DistributedMatrix.h          |  42 +---
 src/TNL/Matrices/DistributedMatrix_impl.h     | 150 ++++++-------
 src/TNL/Matrices/DistributedSpMV.h            |  44 ++--
 .../Solvers/Linear/Preconditioners/Diagonal.h |   8 +-
 .../Linear/Preconditioners/Diagonal_impl.h    |   8 +-
 src/TNL/Solvers/Linear/Preconditioners/ILU0.h |   8 +-
 src/TNL/Solvers/Linear/Traits.h               |  18 +-
 src/TNL/TypeTraits.h                          |  17 ++
 .../Containers/DistributedArrayTest.h         |  21 +-
 .../Containers/DistributedVectorTest.h        |  21 +-
 .../Containers/VectorBinaryOperationsTest.h   |  51 ++---
 .../Containers/VectorHelperFunctions.h        |   1 +
 .../Containers/VectorUnaryOperationsTest.h    |  42 ++--
 .../Containers/VectorVerticalOperationsTest.h |  29 ++-
 .../Matrices/DistributedMatrixTest.h          |  23 +-
 34 files changed, 741 insertions(+), 803 deletions(-)

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index abe08210d..e8b5c9de1 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -19,8 +19,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -38,8 +38,6 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index,
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 
 template< typename Matrix, typename Vector >
 void
@@ -110,7 +108,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
    // benchmark function
    auto compute = [&]() {
       matrix.vectorProduct( x, y );
-      Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
+      TNL::MPI::Barrier( matrix.getCommunicationGroup() );
    };
 
    benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
@@ -150,9 +148,9 @@ struct SpmvBenchmark
    using IndexType = typename MatrixType::IndexType;
    using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
 
-   using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >;
-   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using Partitioner = Containers::Partitioner< IndexType >;
+   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
    using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
 
    static bool
@@ -169,7 +167,7 @@ struct SpmvBenchmark
       matrix.getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (CommunicatorType::isDistributed()) ? "DistSpMV" : "SpMV" )
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       benchmark.setMetadataColumns( Benchmark::MetadataColumns({
@@ -189,13 +187,13 @@ struct SpmvBenchmark
          getTrivialOrdering( matrix, perm, iperm );
          MatrixType matrix_perm;
          Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm );
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix_perm, vector );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector );
       }
       else {
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix, vector );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix, vector );
@@ -225,7 +223,7 @@ struct SpmvBenchmark
                    VectorType& vector )
    {
       // set up the distributed matrix
-      const auto group = CommunicatorType::AllGroup;
+      const auto group = TNL::MPI::AllGroup();
       const auto localRange = Partitioner::splitRange( matrix.getRows(), group );
       DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group );
       DistributedVector distributedVector( localRange, 0, matrix.getRows(), group );
@@ -267,8 +265,8 @@ struct SpmvBenchmark
       DistributedVector distributedY;
       distributedY.setLike( distributedVector );
       distributedMatrix.vectorProduct( distributedVector, distributedY );
-      const int rank = CommunicatorType::GetRank( distributedMatrix.getCommunicationGroup() );
-      const int nproc = CommunicatorType::GetSize( distributedMatrix.getCommunicationGroup() );
+      const int rank = TNL::MPI::GetRank( distributedMatrix.getCommunicationGroup() );
+      const int nproc = TNL::MPI::GetSize( distributedMatrix.getCommunicationGroup() );
       typename VectorType::ViewType subY( &y[ Partitioner::getOffset( matrix.getRows(), rank, nproc ) ],
                                           Partitioner::getSizeForRank( matrix.getRows(), rank, nproc ) );
       TNL_ASSERT_EQ( distributedY.getLocalView(), subY, "WRONG RESULT !!!" );
@@ -294,7 +292,7 @@ configSetup( Config::ConfigDescription & config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int
@@ -310,14 +308,14 @@ main( int argc, char* argv[] )
    configSetup( conf_desc );
 
    TNL::MPI::ScopedInitializer mpi(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a4c04578d..c10c996e3 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -33,10 +33,10 @@ void barrier( const Matrix& matrix )
 {
 }
 
-template< typename Matrix, typename Communicator >
-void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix )
+template< typename Matrix >
+void barrier( const Matrices::DistributedMatrix< Matrix >& matrix )
 {
-   Communicator::Barrier( matrix.getCommunicationGroup() );
+   TNL::MPI::Barrier( matrix.getCommunicationGroup() );
 }
 
 template< typename Device >
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 75b1e0e25..3acfb2438 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -24,8 +24,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -65,8 +65,6 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 
 static const std::set< std::string > valid_solvers = {
    "gmres",
@@ -333,9 +331,9 @@ struct LinearSolversBenchmark
    using IndexType = typename MatrixType::IndexType;
    using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
 
-   using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >;
-   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using Partitioner = Containers::Partitioner< IndexType >;
+   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
    using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
 
    static bool
@@ -383,7 +381,7 @@ struct LinearSolversBenchmark
       matrixPointer->getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" )
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       benchmark.setMetadataColumns( Benchmark::MetadataColumns({
@@ -408,13 +406,13 @@ struct LinearSolversBenchmark
          Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm );
          Matrices::reorderArray( x0, x0_perm, perm );
          Matrices::reorderArray( b, b_perm, perm );
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
       }
       else {
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
          else
             runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
@@ -432,7 +430,7 @@ struct LinearSolversBenchmark
                    const VectorType& b )
    {
       // set up the distributed matrix
-      const auto group = CommunicatorType::AllGroup;
+      const auto group = TNL::MPI::AllGroup();
       const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group );
       SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group );
       DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group );
@@ -567,7 +565,7 @@ configSetup( Config::ConfigDescription& config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 
    config.addDelimiter( "Linear solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
@@ -593,13 +591,13 @@ main( int argc, char* argv[] )
    configSetup( conf_desc );
 
    TNL::MPI::ScopedInitializer mpi(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp
index 5039417b7..fcc8654be 100644
--- a/src/Benchmarks/ODESolvers/Euler.hpp
+++ b/src/Benchmarks/ODESolvers/Euler.hpp
@@ -200,7 +200,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
    }
 
    localResidue /= tau * ( RealType ) size;
-   Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup );
+   TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() );
    //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl;
 }
 
diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp
index 1fd8f8a2b..b45faa1b4 100644
--- a/src/Benchmarks/ODESolvers/Merson.hpp
+++ b/src/Benchmarks/ODESolvers/Merson.hpp
@@ -185,13 +185,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u )
          time += currentTau;
          computeNewTimeLevel( time, currentTau, u, newResidue );
          this->setResidue( newResidue );
- 
+
          /****
           * When time is close to stopTime the new residue
           * may be inaccurate significantly.
           */
          if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue );
-         
+
 
          if( ! this->nextIteration() )
             return false;
@@ -207,7 +207,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u )
          currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
          TNLMPI::Bcast( currentTau, 1, 0 );
-#endif        
+#endif
       }
       if( time + currentTau > this->getStopTime() )
          currentTau = this->getStopTime() - time; //we don't want to keep such tau
@@ -403,7 +403,7 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con
       }
 #endif
    }
-   Problem::CommunicatorType::Allreduce( &eps, &maxEps, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+   TNL::MPI::Allreduce( &eps, &maxEps, 1, MPI_MAX, TNL::MPI::AllGroup() );
    return maxEps;
 }
 
@@ -465,7 +465,7 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time,
    }
 
    localResidue /= tau * ( RealType ) size;
-   Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup);
+   TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() );
 /*#ifdef USE_MPI
    TNLMPI::Allreduce( localResidue, currentResidue, 1, MPI_SUM);
 #else
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index fcaaaedf2..0d8d3c04e 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -23,8 +23,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
 
@@ -38,8 +38,6 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 
 template< typename Real, typename Index >
 void
@@ -113,7 +111,7 @@ struct ODESolversBenchmark
         Benchmark::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
-      const String name = String( (CommunicatorType::isDistributed()) ? "Distributed ODE solvers" : "ODE solvers" );
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" );
                           //+ " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) {
@@ -122,7 +120,7 @@ struct ODESolversBenchmark
             { "DOFs", convertToString( dofs ) },
          } ));
 
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, dofs );
          else
             runNonDistributed( benchmark, metadata, parameters, dofs );
@@ -136,7 +134,7 @@ struct ODESolversBenchmark
                    const Config::ParameterContainer& parameters,
                    size_t dofs )
    {
-      //const auto group = CommunicatorType::AllGroup;
+      //const auto group = TNL::MPI::AllGroup();
 
       std::cout << "Iterative solvers:" << std::endl;
       benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs );
@@ -204,7 +202,7 @@ configSetup( Config::ConfigDescription& config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 
    config.addDelimiter( "ODE solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
@@ -226,13 +224,13 @@ main( int argc, char* argv[] )
    configSetup( conf_desc );
 
    TNL::MPI::ScopedInitializer mpi(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h
index 742acd5ed..aa7c008a7 100644
--- a/src/TNL/Algorithms/DistributedScan.h
+++ b/src/TNL/Algorithms/DistributedScan.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Algorithms/Scan.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Algorithms {
@@ -32,10 +33,9 @@ struct DistributedScan
    {
       using RealType = typename DistributedVector::RealType;
       using DeviceType = typename DistributedVector::DeviceType;
-      using CommunicatorType = typename DistributedVector::CommunicatorType;
 
       const auto group = v.getCommunicationGroup();
-      if( group != CommunicatorType::NullGroup ) {
+      if( group != MPI::NullGroup() ) {
          // adjust begin and end for the local range
          const auto localRange = v.getLocalRange();
          begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin();
@@ -47,18 +47,18 @@ struct DistributedScan
          const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
 
          // exchange local sums between ranks
-         const int nproc = CommunicatorType::GetSize( group );
+         const int nproc = MPI::GetSize( group );
          RealType dataForScatter[ nproc ];
          for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
          Containers::Vector< RealType, Devices::Host > rankSums( nproc );
          // NOTE: exchanging general data types does not work with MPI
-         CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
+         MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
 
          // compute the scan of the per-rank sums
          Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
 
          // perform second phase: shift by the per-block and per-rank offsets
-         const int rank = CommunicatorType::GetRank( group );
+         const int rank = MPI::GetRank( group );
          Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
       }
    }
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 33e96ca9a..3947bfec4 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -21,22 +21,21 @@ namespace Containers {
 template< typename Value,
           typename Device = Devices::Host,
           typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class DistributedArray
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using LocalArrayType = Containers::Array< Value, Device, Index >;
+   using LocalArrayType = Containers::Array< Value, Device, Index, Allocator >;
 
 public:
    using ValueType = Value;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
+   using AllocatorType = Allocator;
    using LocalRangeType = Subrange< Index >;
    using LocalViewType = Containers::ArrayView< Value, Device, Index >;
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
-   using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
-   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using ViewType = DistributedArrayView< Value, Device, Index >;
+   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >;
    using SynchronizerType = typename ViewType::SynchronizerType;
 
    /**
@@ -45,26 +44,50 @@ public:
    template< typename _Value,
              typename _Device = Device,
              typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedArray< _Value, _Device, _Index, _Communicator >;
+             typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Value > >
+   using Self = DistributedArray< _Value, _Device, _Index, _Allocator >;
 
 
    ~DistributedArray();
 
+   /**
+    * \brief Constructs an empty array with zero size.
+    */
    DistributedArray() = default;
 
-   // Copy-constructor does deep copy.
-   DistributedArray( const DistributedArray& );
+   /**
+    * \brief Constructs an empty array and sets the provided allocator.
+    *
+    * \param allocator The allocator to be associated with this array.
+    */
+   explicit DistributedArray( const AllocatorType& allocator );
 
-   DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   /**
+    * \brief Copy constructor (makes a deep copy).
+    *
+    * \param array The array to be copied.
+    */
+   explicit DistributedArray( const DistributedArray& array );
 
-   void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   /**
+    * \brief Copy constructor with a specific allocator (makes a deep copy).
+    *
+    * \param array The array to be copied.
+    * \param allocator The allocator to be associated with this array.
+    */
+   explicit DistributedArray( const DistributedArray& array, const AllocatorType& allocator );
+
+   DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup(), const AllocatorType& allocator = AllocatorType() );
+
+   void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup() );
 
    const LocalRangeType& getLocalRange() const;
 
    IndexType getGhosts() const;
 
-   CommunicationGroup getCommunicationGroup() const;
+   MPI_Comm getCommunicationGroup() const;
+
+   AllocatorType getAllocator() const;
 
    /**
     * \brief Returns a modifiable view of the local part of the array.
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index 61dc3eda0..e9ee12093 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -22,8 +22,8 @@ namespace Containers {
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 ~DistributedArray()
 {
    // Wait for pending async operation, otherwise the synchronizer would crash
@@ -34,20 +34,43 @@ DistributedArray< Value, Device, Index, Communicator >::
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( const Allocator& allocator )
+: localData( allocator )
+{
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 DistributedArray( const DistributedArray& array )
 {
    setLike( array );
-   localData = array.getConstLocalViewWithGhosts();
+   view = array;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
-DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group )
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( const DistributedArray& array, const Allocator& allocator )
+: localData( allocator )
+{
+   setLike( array );
+   view = array;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, const Allocator& allocator )
+: localData( allocator )
 {
    setDistribution( localRange, ghosts, globalSize, group );
 }
@@ -55,13 +78,13 @@ DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalS
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
-setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group )
+DistributedArray< Value, Device, Index, Allocator >::
+setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group )
 {
    TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" );
-   if( group != Communicator::NullGroup )
+   if( group != MPI::NullGroup() )
       localData.setSize( localRange.getSize() + ghosts );
    view.bind( localRange, ghosts, globalSize, group, localData.getView() );
 }
@@ -69,9 +92,9 @@ setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSi
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 const Subrange< Index >&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getLocalRange() const
 {
    return view.getLocalRange();
@@ -80,9 +103,9 @@ getLocalRange() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 Index
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getGhosts() const
 {
    return view.getGhosts();
@@ -91,9 +114,9 @@ getGhosts() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename Communicator::CommunicationGroup
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+MPI_Comm
+DistributedArray< Value, Device, Index, Allocator >::
 getCommunicationGroup() const
 {
    return view.getCommunicationGroup();
@@ -102,9 +125,20 @@ getCommunicationGroup() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+Allocator
+DistributedArray< Value, Device, Index, Allocator >::
+getAllocator() const
+{
+   return localData.getAllocator();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getLocalView()
 {
    return view.getLocalView();
@@ -113,9 +147,9 @@ getLocalView()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getConstLocalView() const
 {
    return view.getConstLocalView();
@@ -124,9 +158,9 @@ getConstLocalView() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getLocalViewWithGhosts()
 {
    return view.getLocalViewWithGhosts();
@@ -135,9 +169,9 @@ getLocalViewWithGhosts()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getConstLocalViewWithGhosts() const
 {
    return view.getConstLocalViewWithGhosts();
@@ -147,9 +181,9 @@ getConstLocalViewWithGhosts() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 copyFromGlobal( ConstLocalViewType globalArray )
 {
    view.copyFromGlobal( globalArray );
@@ -158,9 +192,9 @@ copyFromGlobal( ConstLocalViewType globalArray )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
 {
    view.setSynchronizer( synchronizer, valuesPerElement );
@@ -169,9 +203,9 @@ setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPer
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType >
+DistributedArray< Value, Device, Index, Allocator >::
 getSynchronizer() const
 {
    return view.getSynchronizer();
@@ -180,9 +214,9 @@ getSynchronizer() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 int
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getValuesPerElement() const
 {
    return view.getValuesPerElement();
@@ -191,9 +225,9 @@ getValuesPerElement() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 startSynchronization()
 {
    view.startSynchronization();
@@ -202,9 +236,9 @@ startSynchronization()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 waitForSynchronization() const
 {
    view.waitForSynchronization();
@@ -218,9 +252,9 @@ waitForSynchronization() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getView()
 {
    return view;
@@ -229,9 +263,9 @@ getView()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getConstView() const
 {
    return view.getConstView();
@@ -240,8 +274,8 @@ getConstView() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 operator ViewType()
 {
    return getView();
@@ -250,8 +284,8 @@ operator ViewType()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 operator ConstViewType() const
 {
    return getConstView();
@@ -260,10 +294,10 @@ operator ConstViewType() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setLike( const Array& array )
 {
    localData.setLike( array.getConstLocalViewWithGhosts() );
@@ -276,9 +310,9 @@ setLike( const Array& array )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 reset()
 {
    view.reset();
@@ -288,9 +322,9 @@ reset()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 empty() const
 {
    return view.empty();
@@ -299,9 +333,9 @@ empty() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 Index
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getSize() const
 {
    return view.getSize();
@@ -310,9 +344,9 @@ getSize() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setValue( ValueType value )
 {
    view.setValue( value );
@@ -321,9 +355,9 @@ setValue( ValueType value )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setElement( IndexType i, ValueType value )
 {
    view.setElement( i, value );
@@ -332,9 +366,9 @@ setElement( IndexType i, ValueType value )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 Value
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getElement( IndexType i ) const
 {
    return view.getElement( i );
@@ -343,10 +377,10 @@ getElement( IndexType i ) const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 __cuda_callable__
 Value&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator[]( IndexType i )
 {
    return view[ i ];
@@ -355,10 +389,10 @@ operator[]( IndexType i )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 __cuda_callable__
 const Value&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator[]( IndexType i ) const
 {
    return view[ i ];
@@ -367,9 +401,9 @@ operator[]( IndexType i ) const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >&
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >&
+DistributedArray< Value, Device, Index, Allocator >::
 operator=( const DistributedArray& array )
 {
    setLike( array );
@@ -380,10 +414,10 @@ operator=( const DistributedArray& array )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array, typename..., typename >
-DistributedArray< Value, Device, Index, Communicator >&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >&
+DistributedArray< Value, Device, Index, Allocator >::
 operator=( const Array& array )
 {
    setLike( array );
@@ -394,10 +428,10 @@ operator=( const Array& array )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator==( const Array& array ) const
 {
    return view == array;
@@ -406,10 +440,10 @@ operator==( const Array& array ) const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator!=( const Array& array ) const
 {
    return view != array;
@@ -418,9 +452,9 @@ operator!=( const Array& array ) const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 containsValue( ValueType value ) const
 {
    return view.containsValue( value );
@@ -429,9 +463,9 @@ containsValue( ValueType value ) const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 containsOnlyValue( ValueType value ) const
 {
    return view.containsOnlyValue( value );
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 0a9aef1a4..cb3235ddb 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -15,30 +15,27 @@
 #include <memory>
 
 #include <TNL/Containers/ArrayView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/ByteArraySynchronizer.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename Value,
           typename Device = Devices::Host,
-          typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Index = int >
 class DistributedArrayView
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = Value;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
    using LocalRangeType = Subrange< Index >;
    using LocalViewType = Containers::ArrayView< Value, Device, Index >;
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
-   using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
-   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using ViewType = DistributedArrayView< Value, Device, Index >;
+   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >;
    using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >;
 
    /**
@@ -46,15 +43,14 @@ public:
     */
    template< typename _Value,
              typename _Device = Device,
-             typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >;
+             typename _Index = Index >
+   using Self = DistributedArrayView< _Value, _Device, _Index >;
 
 
    ~DistributedArrayView();
 
    // Initialization by raw data
-   DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
+   DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData )
    : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData)
    {
       TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
@@ -69,13 +65,13 @@ public:
 
    // "Templated copy-constructor" accepting any cv-qualification of Value
    template< typename Value_ >
-   DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& );
+   DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& );
 
    // default move-constructor
    DistributedArrayView( DistributedArrayView&& ) = default;
 
    // method for rebinding (reinitialization) to raw data
-   void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData );
+   void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData );
 
    // Note that you can also bind directly to DistributedArray and other types implicitly
    // convertible to DistributedArrayView.
@@ -90,7 +86,7 @@ public:
 
    IndexType getGhosts() const;
 
-   CommunicationGroup getCommunicationGroup() const;
+   MPI_Comm getCommunicationGroup() const;
 
    LocalViewType getLocalView();
 
@@ -184,7 +180,7 @@ protected:
    LocalRangeType localRange;
    IndexType ghosts = 0;
    IndexType globalSize = 0;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    LocalViewType localData;
 
    std::shared_ptr< SynchronizerType > synchronizer = nullptr;
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 65654a54d..65ecc4101 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -19,9 +19,8 @@ namespace Containers {
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+DistributedArrayView< Value, Device, Index >::
 ~DistributedArrayView()
 {
    // Wait for pending async operation, otherwise the synchronizer might crash
@@ -33,11 +32,10 @@ DistributedArrayView< Value, Device, Index, Communicator >::
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Value_ >
-DistributedArrayView< Value, Device, Index, Communicator >::
-DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view )
+DistributedArrayView< Value, Device, Index >::
+DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& view )
 : localRange( view.getLocalRange() ),
   ghosts( view.getGhosts() ),
   globalSize( view.getSize() ),
@@ -49,11 +47,10 @@ DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communi
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
-bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
+DistributedArrayView< Value, Device, Index >::
+bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData )
 {
    TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
                   "The local array size does not match the local range of the distributed array." );
@@ -68,10 +65,9 @@ bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize,
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 bind( DistributedArrayView view )
 {
    localRange = view.getLocalRange();
@@ -86,11 +82,10 @@ bind( DistributedArrayView view )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Value_ >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 bind( Value_* data, IndexType localSize )
 {
    TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts,
@@ -100,10 +95,9 @@ bind( Value_* data, IndexType localSize )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 const Subrange< Index >&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getLocalRange() const
 {
    return localRange;
@@ -111,10 +105,9 @@ getLocalRange() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 Index
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getGhosts() const
 {
    return ghosts;
@@ -122,10 +115,9 @@ getGhosts() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename Communicator::CommunicationGroup
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+MPI_Comm
+DistributedArrayView< Value, Device, Index >::
 getCommunicationGroup() const
 {
    return group;
@@ -133,10 +125,9 @@ getCommunicationGroup() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::LocalViewType
+DistributedArrayView< Value, Device, Index >::
 getLocalView()
 {
    return LocalViewType( localData.getData(), localRange.getSize() );
@@ -144,10 +135,9 @@ getLocalView()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType
+DistributedArrayView< Value, Device, Index >::
 getConstLocalView() const
 {
    return ConstLocalViewType( localData.getData(), localRange.getSize() );
@@ -155,10 +145,9 @@ getConstLocalView() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::LocalViewType
+DistributedArrayView< Value, Device, Index >::
 getLocalViewWithGhosts()
 {
    return localData;
@@ -166,10 +155,9 @@ getLocalViewWithGhosts()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType
+DistributedArrayView< Value, Device, Index >::
 getConstLocalViewWithGhosts() const
 {
    return localData;
@@ -177,10 +165,9 @@ getConstLocalViewWithGhosts() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 copyFromGlobal( ConstLocalViewType globalArray )
 {
    TNL_ASSERT_EQ( getSize(), globalArray.getSize(),
@@ -200,10 +187,9 @@ copyFromGlobal( ConstLocalViewType globalArray )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
 {
    this->synchronizer = synchronizer;
@@ -212,10 +198,9 @@ setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPer
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-std::shared_ptr< typename DistributedArrayView< Value, Device, Index, Communicator >::SynchronizerType >
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType >
+DistributedArrayView< Value, Device, Index >::
 getSynchronizer() const
 {
    return synchronizer;
@@ -223,10 +208,9 @@ getSynchronizer() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 int
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getValuesPerElement() const
 {
    return valuesPerElement;
@@ -234,10 +218,9 @@ getValuesPerElement() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 startSynchronization()
 {
    if( ghosts == 0 )
@@ -255,10 +238,9 @@ startSynchronization()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 waitForSynchronization() const
 {
    if( synchronizer && synchronizer->async_op.valid() ) {
@@ -271,10 +253,9 @@ waitForSynchronization() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ViewType
+DistributedArrayView< Value, Device, Index >::
 getView()
 {
    return *this;
@@ -282,10 +263,9 @@ getView()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstViewType
+DistributedArrayView< Value, Device, Index >::
 getConstView() const
 {
    return *this;
@@ -293,25 +273,23 @@ getConstView() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 reset()
 {
    localRange.reset();
    ghosts = 0;
    globalSize = 0;
-   group = Communicator::NullGroup;
+   group = MPI::NullGroup();
    localData.reset();
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 empty() const
 {
    return getSize() == 0;
@@ -321,10 +299,9 @@ empty() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 Index
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getSize() const
 {
    return globalSize;
@@ -332,10 +309,9 @@ getSize() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 setValue( ValueType value )
 {
    localData.setValue( value );
@@ -344,10 +320,9 @@ setValue( ValueType value )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 setElement( IndexType i, ValueType value )
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -356,10 +331,9 @@ setElement( IndexType i, ValueType value )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 Value
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getElement( IndexType i ) const
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -368,11 +342,10 @@ getElement( IndexType i ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 __cuda_callable__
 Value&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator[]( IndexType i )
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -381,11 +354,10 @@ operator[]( IndexType i )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 __cuda_callable__
 const Value&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator[]( IndexType i ) const
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -394,10 +366,9 @@ operator[]( IndexType i ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
+          typename Index >
+DistributedArrayView< Value, Device, Index >&
+DistributedArrayView< Value, Device, Index >::
 operator=( const DistributedArrayView& view )
 {
    TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
@@ -413,11 +384,10 @@ operator=( const DistributedArrayView& view )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Array, typename..., typename >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >&
+DistributedArrayView< Value, Device, Index >::
 operator=( const Array& array )
 {
    TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
@@ -433,11 +403,10 @@ operator=( const Array& array )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Array >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator==( const Array& array ) const
 {
    // we can't run allreduce if the communication groups are different
@@ -450,18 +419,17 @@ operator==( const Array& array ) const
          // compare without ghosts
          getConstLocalView() == array.getConstLocalView();
    bool result = true;
-   if( group != CommunicatorType::NullGroup )
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+   if( group != MPI::NullGroup() )
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
    return result;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Array >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator!=( const Array& array ) const
 {
    return ! (*this == array);
@@ -469,32 +437,30 @@ operator!=( const Array& array ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 containsValue( ValueType value ) const
 {
    bool result = false;
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       const bool localResult = localData.containsValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group );
    }
    return result;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 containsOnlyValue( ValueType value ) const
 {
    bool result = true;
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       const bool localResult = localData.containsOnlyValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
    }
    return result;
 }
diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index 32dc80125..8d737e3a9 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -21,21 +21,20 @@ namespace Containers {
 template< typename Real,
           typename Device = Devices::Host,
           typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Real > >
 class DistributedVector
-: public DistributedArray< Real, Device, Index, Communicator >
+: public DistributedArray< Real, Device, Index, Allocator >
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using BaseType = DistributedArray< Real, Device, Index, Communicator >;
+   using BaseType = DistributedArray< Real, Device, Index, Allocator >;
 public:
    using RealType = Real;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
+   using AllocatorType = Allocator;
    using LocalViewType = Containers::VectorView< Real, Device, Index >;
    using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >;
-   using ViewType = DistributedVectorView< Real, Device, Index, Communicator >;
-   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >;
+   using ViewType = DistributedVectorView< Real, Device, Index >;
+   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref Vector type with changed template parameters.
@@ -43,8 +42,8 @@ public:
    template< typename _Real,
              typename _Device = Device,
              typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedVector< _Real, _Device, _Index, _Communicator >;
+             typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Real > >
+   using Self = DistributedVector< _Real, _Device, _Index, _Allocator >;
 
 
    // inherit all constructors and assignment operators from Array
@@ -60,6 +59,11 @@ public:
     */
    explicit DistributedVector( const DistributedVector& ) = default;
 
+   /**
+    * \brief Copy constructor with a specific allocator (makes a deep copy).
+    */
+   explicit DistributedVector( const DistributedVector& vector, const AllocatorType& allocator );
+
    /**
     * \brief Default move constructor.
     */
@@ -177,8 +181,8 @@ public:
 
 // Enable expression templates for DistributedVector
 namespace Expressions {
-   template< typename Real, typename Device, typename Index, typename Communicator >
-   struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Communicator > >
+   template< typename Real, typename Device, typename Index, typename Allocator >
+   struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Allocator > >
    : std::true_type
    {};
 } // namespace Expressions
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index cbbc763ec..044b747d9 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -21,9 +21,19 @@ namespace Containers {
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Real, Device, Index, Allocator >::
+DistributedVector( const DistributedVector& vector, const AllocatorType& allocator )
+: BaseType::DistributedArray( vector, allocator )
+{
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getLocalView()
 {
    return BaseType::getLocalView();
@@ -32,9 +42,9 @@ getLocalView()
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getConstLocalView() const
 {
    return BaseType::getConstLocalView();
@@ -43,9 +53,9 @@ getConstLocalView() const
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getLocalViewWithGhosts()
 {
    return BaseType::getLocalViewWithGhosts();
@@ -54,9 +64,9 @@ getLocalViewWithGhosts()
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getConstLocalViewWithGhosts() const
 {
    return BaseType::getConstLocalViewWithGhosts();
@@ -65,9 +75,9 @@ getConstLocalViewWithGhosts() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Value, Device, Index, Communicator >::ViewType
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Value, Device, Index, Allocator >::ViewType
+DistributedVector< Value, Device, Index, Allocator >::
 getView()
 {
    return BaseType::getView();
@@ -76,9 +86,9 @@ getView()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Value, Device, Index, Allocator >::ConstViewType
+DistributedVector< Value, Device, Index, Allocator >::
 getConstView() const
 {
    return BaseType::getConstView();
@@ -87,8 +97,8 @@ getConstView() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Value, Device, Index, Allocator >::
 operator ViewType()
 {
    return getView();
@@ -97,8 +107,8 @@ operator ViewType()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Value, Device, Index, Allocator >::
 operator ConstViewType() const
 {
    return getConstView();
@@ -112,10 +122,10 @@ operator ConstViewType() const
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator=( const Vector& vector )
 {
    this->setLike( vector );
@@ -126,10 +136,10 @@ operator=( const Vector& vector )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator+=( const Vector& vector )
 {
    getView() += vector;
@@ -139,10 +149,10 @@ operator+=( const Vector& vector )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator-=( const Vector& vector )
 {
    getView() -= vector;
@@ -152,10 +162,10 @@ operator-=( const Vector& vector )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator*=( const Vector& vector )
 {
    getView() *= vector;
@@ -165,10 +175,10 @@ operator*=( const Vector& vector )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator/=( const Vector& vector )
 {
    getView() /= vector;
@@ -178,10 +188,10 @@ operator/=( const Vector& vector )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator=( Scalar c )
 {
    getView() = c;
@@ -191,10 +201,10 @@ operator=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator+=( Scalar c )
 {
    getView() += c;
@@ -204,10 +214,10 @@ operator+=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator-=( Scalar c )
 {
    getView() -= c;
@@ -217,10 +227,10 @@ operator-=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator*=( Scalar c )
 {
    getView() *= c;
@@ -230,10 +240,10 @@ operator*=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator/=( Scalar c )
 {
    getView() /= c;
@@ -243,10 +253,10 @@ operator/=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< Algorithms::ScanType Type >
 void
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >::
 scan( IndexType begin, IndexType end )
 {
    getView().template scan< Type >( begin, end );
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 6be52d9db..4a46a47ce 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -21,32 +21,28 @@ namespace Containers {
 
 template< typename Real,
           typename Device = Devices::Host,
-          typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Index = int >
 class DistributedVectorView
-: public DistributedArrayView< Real, Device, Index, Communicator >
+: public DistributedArrayView< Real, Device, Index >
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using BaseType = DistributedArrayView< Real, Device, Index, Communicator >;
+   using BaseType = DistributedArrayView< Real, Device, Index >;
    using NonConstReal = typename std::remove_const< Real >::type;
 public:
    using RealType = Real;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
    using LocalViewType = Containers::VectorView< Real, Device, Index >;
    using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >;
-   using ViewType = DistributedVectorView< Real, Device, Index, Communicator >;
-   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >;
+   using ViewType = DistributedVectorView< Real, Device, Index >;
+   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref VectorView type with changed template parameters.
     */
    template< typename _Real,
              typename _Device = Device,
-             typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedVectorView< _Real, _Device, _Index, _Communicator >;
+             typename _Index = Index >
+   using Self = DistributedVectorView< _Real, _Device, _Index >;
 
 
    // inherit all constructors and assignment operators from ArrayView
@@ -62,7 +58,7 @@ public:
 
    // initialization by base class is not a copy constructor so it has to be explicit
    template< typename Real_ >  // template catches both const and non-const qualified Element
-   DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view )
+   DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index >& view )
    : BaseType( view ) {}
 
    /**
@@ -156,8 +152,8 @@ public:
 
 // Enable expression templates for DistributedVector
 namespace Expressions {
-   template< typename Real, typename Device, typename Index, typename Communicator >
-   struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index, Communicator > >
+   template< typename Real, typename Device, typename Index >
+   struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index > >
    : std::true_type
    {};
 } // namespace Expressions
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index f1a6fb1e5..2f9222f94 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -20,10 +20,9 @@ namespace Containers {
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::LocalViewType
+DistributedVectorView< Real, Device, Index >::
 getLocalView()
 {
    return BaseType::getLocalView();
@@ -31,10 +30,9 @@ getLocalView()
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType
+DistributedVectorView< Real, Device, Index >::
 getConstLocalView() const
 {
    return BaseType::getConstLocalView();
@@ -42,10 +40,9 @@ getConstLocalView() const
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::LocalViewType
+DistributedVectorView< Real, Device, Index >::
 getLocalViewWithGhosts()
 {
    return BaseType::getLocalViewWithGhosts();
@@ -53,10 +50,9 @@ getLocalViewWithGhosts()
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType
+DistributedVectorView< Real, Device, Index >::
 getConstLocalViewWithGhosts() const
 {
    return BaseType::getConstLocalViewWithGhosts();
@@ -64,10 +60,9 @@ getConstLocalViewWithGhosts() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType
-DistributedVectorView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Value, Device, Index >::ViewType
+DistributedVectorView< Value, Device, Index >::
 getView()
 {
    return *this;
@@ -75,10 +70,9 @@ getView()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType
-DistributedVectorView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Value, Device, Index >::ConstViewType
+DistributedVectorView< Value, Device, Index >::
 getConstView() const
 {
    return *this;
@@ -91,11 +85,10 @@ getConstView() const
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
@@ -107,7 +100,7 @@ operator=( const Vector& vector )
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "The communication groups of the array views must be equal." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       // TODO: it might be better to split the local and ghost parts and synchronize in the middle
       this->waitForSynchronization();
       vector.waitForSynchronization();
@@ -118,11 +111,10 @@ operator=( const Vector& vector )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator+=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
@@ -134,7 +126,7 @@ operator+=( const Vector& vector )
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       // TODO: it might be better to split the local and ghost parts and synchronize in the middle
       this->waitForSynchronization();
       vector.waitForSynchronization();
@@ -145,11 +137,10 @@ operator+=( const Vector& vector )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator-=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
@@ -161,7 +152,7 @@ operator-=( const Vector& vector )
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       // TODO: it might be better to split the local and ghost parts and synchronize in the middle
       this->waitForSynchronization();
       vector.waitForSynchronization();
@@ -172,11 +163,10 @@ operator-=( const Vector& vector )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator*=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
@@ -188,7 +178,7 @@ operator*=( const Vector& vector )
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       // TODO: it might be better to split the local and ghost parts and synchronize in the middle
       this->waitForSynchronization();
       vector.waitForSynchronization();
@@ -199,11 +189,10 @@ operator*=( const Vector& vector )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator/=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
@@ -215,7 +204,7 @@ operator/=( const Vector& vector )
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       // TODO: it might be better to split the local and ghost parts and synchronize in the middle
       this->waitForSynchronization();
       vector.waitForSynchronization();
@@ -226,14 +215,13 @@ operator/=( const Vector& vector )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() = c;
       this->startSynchronization();
    }
@@ -242,14 +230,13 @@ operator=( Scalar c )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator+=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() += c;
       this->startSynchronization();
    }
@@ -258,14 +245,13 @@ operator+=( Scalar c )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator-=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() -= c;
       this->startSynchronization();
    }
@@ -274,14 +260,13 @@ operator-=( Scalar c )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator*=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() *= c;
       this->startSynchronization();
    }
@@ -290,14 +275,13 @@ operator*=( Scalar c )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator/=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() /= c;
       this->startSynchronization();
    }
@@ -306,11 +290,10 @@ operator/=( Scalar c )
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< Algorithms::ScanType Type >
 void
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >::
 scan( IndexType begin, IndexType end )
 {
    if( end == 0 )
diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h
index 2695ccccc..10bf2d117 100644
--- a/src/TNL/Containers/Expressions/DistributedComparison.h
+++ b/src/TNL/Containers/Expressions/DistributedComparison.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/MPI/DummyDefs.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -43,8 +43,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
             // compare without ghosts
             a.getConstLocalView() == b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -64,8 +64,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult = a.getConstLocalView() < b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -80,8 +80,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult = a.getConstLocalView() <= b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -96,8 +96,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult = a.getConstLocalView() > b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -112,8 +112,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult = a.getConstLocalView() >= b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 };
@@ -128,8 +128,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a == b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -142,8 +142,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a < b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -151,8 +151,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a <= b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -160,8 +160,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a > b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -169,8 +169,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a >= b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 };
@@ -185,8 +185,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() == b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -199,8 +199,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() < b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -208,8 +208,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() <= b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -217,8 +217,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() > b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -226,8 +226,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() >= b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 };
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 25175a467..5f67084fd 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -59,8 +59,6 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType,
                                                         typename T2::ConstLocalViewType,
@@ -115,7 +113,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getGhosts();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
    }
@@ -159,8 +157,6 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >;
    using SynchronizerType = typename T1::SynchronizerType;
@@ -199,7 +195,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getGhosts();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
    }
@@ -242,8 +238,6 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
    using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
-   using CommunicatorType = typename T2::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T2::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >;
    using SynchronizerType = typename T2::SynchronizerType;
@@ -282,7 +276,7 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
       return op2.getGhosts();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return op2.getCommunicationGroup();
    }
@@ -326,8 +320,6 @@ struct DistributedUnaryExpressionTemplate
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >;
    using SynchronizerType = typename T1::SynchronizerType;
@@ -366,7 +358,7 @@ struct DistributedUnaryExpressionTemplate
       return operand.getGhosts();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return operand.getCommunicationGroup();
    }
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index f55ae3d4a..903df1e1d 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/VerticalOperations.h>
-#include <TNL/MPI/DummyDefs.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -21,14 +21,13 @@ template< typename Expression >
 auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -40,26 +39,25 @@ auto DistributedExpressionArgMin( const Expression& expression )
    using RealType = std::decay_t< decltype( expression[0] ) >;
    using IndexType = typename Expression::IndexType;
    using ResultType = std::pair< RealType, IndexType >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< RealType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's real type" );
    ResultType result( -1, std::numeric_limits< RealType >::max() );
    const auto group = expression.getCommunicationGroup();
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       // compute local argMin
       ResultType localResult = ExpressionArgMin( expression.getConstLocalView() );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
       // scatter local result to all processes and gather their results
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       ResultType dataForScatter[ nproc ];
       for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult;
       ResultType gatheredResults[ nproc ];
       // NOTE: exchanging general data types does not work with MPI
-      //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
-      CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
+      //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
+      MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
 
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
@@ -82,14 +80,13 @@ template< typename Expression >
 auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::lowest();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -101,26 +98,25 @@ auto DistributedExpressionArgMax( const Expression& expression )
    using RealType = std::decay_t< decltype( expression[0] ) >;
    using IndexType = typename Expression::IndexType;
    using ResultType = std::pair< RealType, IndexType >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< RealType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's real type" );
    ResultType result( -1, std::numeric_limits< RealType >::lowest() );
    const auto group = expression.getCommunicationGroup();
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       // compute local argMax
       ResultType localResult = ExpressionArgMax( expression.getConstLocalView() );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
       // scatter local result to all processes and gather their results
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       ResultType dataForScatter[ nproc ];
       for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult;
       ResultType gatheredResults[ nproc ];
       // NOTE: exchanging general data types does not work with MPI
-      //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
-      CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
+      //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
+      MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
 
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
@@ -143,12 +139,11 @@ template< typename Expression >
 auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionSum( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -157,12 +152,11 @@ template< typename Expression >
 auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_t< decltype( expression[0] * expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 1;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionProduct( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -171,14 +165,13 @@ template< typename Expression >
 auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] && expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -187,12 +180,11 @@ template< typename Expression >
 auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] || expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] || expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalOr( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -201,14 +193,13 @@ template< typename Expression >
 auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -217,12 +208,11 @@ template< typename Expression >
 auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] | expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
    }
    return result;
 }
diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h
index c2dce9e34..6d3605b5a 100644
--- a/src/TNL/Containers/Partitioner.h
+++ b/src/TNL/Containers/Partitioner.h
@@ -22,18 +22,17 @@
 namespace TNL {
 namespace Containers {
 
-template< typename Index, typename Communicator = Communicators::MpiCommunicator >
+template< typename Index >
 class Partitioner
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using SubrangeType = Subrange< Index >;
 
-   static SubrangeType splitRange( Index globalSize, CommunicationGroup group )
+   static SubrangeType splitRange( Index globalSize, MPI_Comm group )
    {
-      if( group != Communicator::NullGroup ) {
-         const int rank = Communicator::GetRank( group );
-         const int partitions = Communicator::GetSize( group );
+      if( group != MPI::NullGroup() ) {
+         const int rank = MPI::GetRank( group );
+         const int partitions = MPI::GetSize( group );
          const Index begin = TNL::min( globalSize, rank * globalSize / partitions );
          const Index end = TNL::min( globalSize, (rank + 1) * globalSize / partitions );
          return SubrangeType( begin, end );
@@ -78,7 +77,7 @@ public:
 
       SubrangeType localRange;
       int overlaps;
-      CommunicationGroup group;
+      MPI_Comm group;
 
    public:
       using ByteArrayView = typename Base::ByteArrayView;
@@ -93,14 +92,14 @@ public:
 
       ArraySynchronizer() = delete;
 
-      ArraySynchronizer( SubrangeType localRange, int overlaps, CommunicationGroup group )
+      ArraySynchronizer( SubrangeType localRange, int overlaps, MPI_Comm group )
       : localRange(localRange), overlaps(overlaps), group(group)
       {}
 
       virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
       {
          auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
-         Communicator::WaitAll( requests.data(), requests.size() );
+         MPI::Waitall( requests.data(), requests.size() );
       }
 
       virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
@@ -108,30 +107,30 @@ public:
          TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps),
                         "unexpected array size" );
 
-         const int rank = Communicator::GetRank( group );
-         const int nproc = Communicator::GetSize( group );
+         const int rank = MPI::GetRank( group );
+         const int nproc = MPI::GetSize( group );
          const int left = (rank > 0) ? rank - 1 : nproc - 1;
          const int right = (rank < nproc - 1) ? rank + 1 : 0;
 
          // buffer for asynchronous communication requests
-         std::vector< typename Communicator::Request > requests;
+         std::vector< MPI_Request > requests;
 
          // issue all async receive operations
-         requests.push_back( Communicator::IRecv(
+         requests.push_back( MPI::Irecv(
                   array.getData() + bytesPerValue * localRange.getSize(),
                   bytesPerValue * overlaps,
                   left, 0, group ) );
-         requests.push_back( Communicator::IRecv(
+         requests.push_back( MPI::Irecv(
                   array.getData() + bytesPerValue * (localRange.getSize() + overlaps),
                   bytesPerValue * overlaps,
                   right, 0, group ) );
 
          // issue all async send operations
-         requests.push_back( Communicator::ISend(
+         requests.push_back( MPI::Isend(
                   array.getData(),
                   bytesPerValue * overlaps,
                   left, 0, group ) );
-         requests.push_back( Communicator::ISend(
+         requests.push_back( MPI::Isend(
                   array.getData() + bytesPerValue * (localRange.getSize() - overlaps),
                   bytesPerValue * overlaps,
                   right, 0, group ) );
diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index 5731d11ca..61e4eabb6 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -14,7 +14,6 @@
 
 #include <type_traits>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Containers/DistributedVectorView.h>
@@ -23,58 +22,39 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename T, typename R = void >
-struct enable_if_type
-{
-   using type = R;
-};
-
-template< typename T, typename Enable = void >
-struct has_communicator : std::false_type {};
-
-template< typename T >
-struct has_communicator< T, typename enable_if_type< typename T::CommunicatorType >::type >
-: std::true_type
-{};
-
-
 // TODO: 2D distribution for dense matrices (maybe it should be in different template,
 //       because e.g. setRowFast doesn't make sense for dense matrices)
-template< typename Matrix,
-          typename Communicator = Communicators::MpiCommunicator >
+template< typename Matrix >
 class DistributedMatrix
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using MatrixType = Matrix;
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
-   using CommunicatorType = Communicator;
    using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >;
 
-   using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >;
+   using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType >;
 
    using MatrixRow = typename Matrix::RowView;
    using ConstMatrixRow = typename Matrix::ConstRowView;
 
    template< typename _Real = RealType,
              typename _Device = DeviceType,
-             typename _Index = IndexType,
-             typename _Communicator = Communicator >
-   using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index >, _Communicator >;
+             typename _Index = IndexType >
+   using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index > >;
 
    DistributedMatrix() = default;
 
    DistributedMatrix( DistributedMatrix& ) = default;
 
-   DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup );
+   DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() );
 
-   void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup );
+   void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() );
 
    const LocalRangeType& getLocalRowRange() const;
 
-   CommunicationGroup getCommunicationGroup() const;
+   MPI_Comm getCommunicationGroup() const;
 
    const Matrix& getLocalMatrix() const;
 
@@ -124,7 +104,7 @@ public:
    // multiplication with a global vector
    template< typename InVector,
              typename OutVector >
-   typename std::enable_if< ! has_communicator< InVector >::value >::type
+   typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type
    vectorProduct( const InVector& inVector,
                   OutVector& outVector ) const;
 
@@ -135,7 +115,7 @@ public:
    // (not const because it modifies internal bufers)
    template< typename InVector,
              typename OutVector >
-   typename std::enable_if< has_communicator< InVector >::value >::type
+   typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type
    vectorProduct( const InVector& inVector,
                   OutVector& outVector ) const;
 
@@ -149,10 +129,10 @@ public:
 protected:
    LocalRangeType localRowRange;
    IndexType rows = 0;  // global rows count
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    Matrix localMatrix;
 
-   DistributedSpMV< Matrix, Communicator > spmv;
+   DistributedSpMV< Matrix > spmv;
 };
 
 } // namespace Matrices
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index b9638e002..8bc5d0982 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -17,60 +17,54 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename Matrix,
-          typename Communicator >
-DistributedMatrix< Matrix, Communicator >::
-DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group )
+template< typename Matrix >
+DistributedMatrix< Matrix >::
+DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group )
 {
    setDistribution( localRowRange, rows, columns, group );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
-setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group )
+DistributedMatrix< Matrix >::
+setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group )
 {
    this->localRowRange = localRowRange;
    this->rows = rows;
    this->group = group;
-   if( group != Communicator::NullGroup )
+   if( group != MPI::NullGroup() )
       localMatrix.setDimensions( localRowRange.getSize(), columns );
 
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 const Containers::Subrange< typename Matrix::IndexType >&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalRowRange() const
 {
    return localRowRange;
 }
 
-template< typename Matrix,
-          typename Communicator >
-typename Communicator::CommunicationGroup
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+MPI_Comm
+DistributedMatrix< Matrix >::
 getCommunicationGroup() const
 {
    return group;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 const Matrix&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalMatrix() const
 {
    return localMatrix;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 Matrix&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalMatrix()
 {
    return localMatrix;
@@ -81,10 +75,9 @@ getLocalMatrix()
  * Some common Matrix methods follow below.
  */
 
-template< typename Matrix,
-          typename Communicator >
-DistributedMatrix< Matrix, Communicator >&
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+DistributedMatrix< Matrix >&
+DistributedMatrix< Matrix >::
 operator=( const DistributedMatrix& matrix )
 {
    setLike( matrix );
@@ -92,11 +85,10 @@ operator=( const DistributedMatrix& matrix )
    return *this;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename MatrixT >
-DistributedMatrix< Matrix, Communicator >&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >&
+DistributedMatrix< Matrix >::
 operator=( const MatrixT& matrix )
 {
    setLike( matrix );
@@ -104,11 +96,10 @@ operator=( const MatrixT& matrix )
    return *this;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename MatrixT >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setLike( const MatrixT& matrix )
 {
    localRowRange = matrix.getLocalRowRange();
@@ -119,84 +110,77 @@ setLike( const MatrixT& matrix )
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 reset()
 {
    localRowRange.reset();
    rows = 0;
-   group = Communicator::NullGroup;
+   group = MPI::NullGroup();
    localMatrix.reset();
 
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getRows() const
 {
    return rows;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getColumns() const
 {
    return localMatrix.getColumns();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename RowCapacitiesVector >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setRowCapacities( const RowCapacitiesVector& rowCapacities )
 {
    TNL_ASSERT_EQ( rowCapacities.getSize(), getRows(), "row lengths vector has wrong size" );
    TNL_ASSERT_EQ( rowCapacities.getLocalRange(), getLocalRowRange(), "row lengths vector has wrong distribution" );
    TNL_ASSERT_EQ( rowCapacities.getCommunicationGroup(), getCommunicationGroup(), "row lengths vector has wrong communication group" );
 
-   if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( getCommunicationGroup() != MPI::NullGroup() ) {
       localMatrix.setRowCapacities( rowCapacities.getConstLocalView() );
 
       spmv.reset();
    }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename Vector >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( getCommunicationGroup() != MPI::NullGroup() ) {
       rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() );
       auto localRowLengths = rowLengths.getLocalView();
       localMatrix.getCompressedRowLengths( localRowLengths );
    }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getRowCapacity( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRowCapacity( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setElement( IndexType row,
             IndexType column,
             RealType value )
@@ -205,10 +189,9 @@ setElement( IndexType row,
    localMatrix.setElement( localRow, column, value );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::RealType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getElement( IndexType row,
             IndexType column ) const
 {
@@ -216,10 +199,9 @@ getElement( IndexType row,
    return localMatrix.getElement( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::RealType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getElementFast( IndexType row,
                 IndexType column ) const
 {
@@ -227,32 +209,29 @@ getElementFast( IndexType row,
    return localMatrix.getElementFast( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
-typename DistributedMatrix< Matrix, Communicator >::MatrixRow
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+typename DistributedMatrix< Matrix >::MatrixRow
+DistributedMatrix< Matrix >::
 getRow( IndexType row )
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRow( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
-typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+typename DistributedMatrix< Matrix >::ConstMatrixRow
+DistributedMatrix< Matrix >::
 getRow( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRow( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename InVector,
              typename OutVector >
-typename std::enable_if< ! has_communicator< InVector >::value >::type
-DistributedMatrix< Matrix, Communicator >::
+typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type
+DistributedMatrix< Matrix >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector ) const
 {
@@ -265,23 +244,21 @@ vectorProduct( const InVector& inVector,
    localMatrix.vectorProduct( inVector, outView );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 updateVectorProductCommunicationPattern()
 {
-   if( getCommunicationGroup() == CommunicatorType::NullGroup )
+   if( getCommunicationGroup() == MPI::NullGroup() )
       return;
    spmv.updateCommunicationPattern( getLocalMatrix(), getCommunicationGroup() );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename InVector,
              typename OutVector >
-typename std::enable_if< has_communicator< InVector >::value >::type
-DistributedMatrix< Matrix, Communicator >::
+typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type
+DistributedMatrix< Matrix >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector ) const
 {
@@ -291,7 +268,7 @@ vectorProduct( const InVector& inVector,
    TNL_ASSERT_EQ( outVector.getLocalRange(), getLocalRowRange(), "output vector has wrong distribution" );
    TNL_ASSERT_EQ( outVector.getCommunicationGroup(), getCommunicationGroup(), "output vector has wrong communication group" );
 
-   if( getCommunicationGroup() == CommunicatorType::NullGroup )
+   if( getCommunicationGroup() == MPI::NullGroup() )
       return;
 
    if( inVector.getGhosts() == 0 ) {
@@ -314,11 +291,10 @@ vectorProduct( const InVector& inVector,
    }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename Vector1, typename Vector2 >
 bool
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 performSORIteration( const Vector1& b,
                      const IndexType row,
                      Vector2& x,
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 76aaa77fe..bea864ead 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -33,7 +33,7 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >
 class DistributedSpMV
 {
 public:
@@ -41,8 +41,6 @@ public:
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
-   using CommunicatorType = Communicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >;
 
    // - communication pattern: vector components whose indices are in the range
@@ -55,10 +53,10 @@ public:
    // - assembly of the i-th row involves traversal of the local matrix stored
    //   in the i-th process
    // - assembly of the full matrix needs all-to-all communication
-   void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, CommunicationGroup group )
+   void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, MPI_Comm group )
    {
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
       commPatternStarts.setDimensions( nproc, nproc );
       commPatternEnds.setDimensions( nproc, nproc );
 
@@ -67,9 +65,9 @@ public:
       {
          Containers::Array< IndexType, Devices::Host, int > sendbuf( nproc );
          sendbuf.setValue( localRowRange.getBegin() );
-         CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                     globalOffsets.getData(), 1,
-                                     group );
+         MPI::Alltoall( sendbuf.getData(), 1,
+                        globalOffsets.getData(), 1,
+                        group );
       }
       const auto globalOffsetsView = globalOffsets.getConstView();
       auto getOwner = [=] __cuda_callable__ ( IndexType global_idx ) -> int
@@ -150,12 +148,12 @@ public:
       }
 
       // assemble the commPattern* matrices
-      CommunicatorType::Alltoall( &preCommPatternStarts(0, 0), nproc,
-                                  &commPatternStarts(0, 0), nproc,
-                                  group );
-      CommunicatorType::Alltoall( &preCommPatternEnds(0, 0), nproc,
-                                  &commPatternEnds(0, 0), nproc,
-                                  group );
+      MPI::Alltoall( &preCommPatternStarts(0, 0), nproc,
+                     &commPatternStarts(0, 0), nproc,
+                     group );
+      MPI::Alltoall( &preCommPatternEnds(0, 0), nproc,
+                     &commPatternEnds(0, 0), nproc,
+                     group );
    }
 
    template< typename InVector,
@@ -164,10 +162,10 @@ public:
                        const MatrixType& localMatrix,
                        const LocalRangeType& localRowRange,
                        const InVector& inVector,
-                       CommunicationGroup group )
+                       MPI_Comm group )
    {
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // handle trivial case
       if( nproc == 1 ) {
@@ -190,14 +188,14 @@ public:
       TNL_ASSERT_EQ( globalBuffer.getSize(), localMatrix.getColumns(), "the global buffer size does not match the number of matrix columns" );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > commRequests;
+      std::vector< MPI_Request > commRequests;
 
       // send our data to all processes that need it
       for( int i = 0; i < commPatternStarts.getRows(); i++ ) {
          if( i == rank )
              continue;
          if( commPatternStarts( i, rank ) < commPatternEnds( i, rank ) )
-            commRequests.push_back( CommunicatorType::ISend(
+            commRequests.push_back( MPI::Isend(
                      inVector.getConstLocalView().getData() + commPatternStarts( i, rank ) - localRowRange.getBegin(),
                      commPatternEnds( i, rank ) - commPatternStarts( i, rank ),
                      i, 0, group ) );
@@ -208,7 +206,7 @@ public:
          if( j == rank )
              continue;
          if( commPatternStarts( rank, j ) < commPatternEnds( rank, j ) )
-            commRequests.push_back( CommunicatorType::IRecv(
+            commRequests.push_back( MPI::Irecv(
                      globalBuffer.getPointer( commPatternStarts( rank, j ) ),
                      commPatternEnds( rank, j ) - commPatternStarts( rank, j ),
                      j, 0, group ) );
@@ -217,7 +215,7 @@ public:
       // general variant
       if( localOnlySpan.first >= localOnlySpan.second ) {
          // wait for all communications to finish
-         CommunicatorType::WaitAll( commRequests.data(), commRequests.size() );
+         MPI::Waitall( commRequests.data(), commRequests.size() );
 
          // perform matrix-vector multiplication
          auto outVectorView = outVector.getLocalView();
@@ -231,7 +229,7 @@ public:
          localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
 
          // wait for all communications to finish
-         CommunicatorType::WaitAll( commRequests.data(), commRequests.size() );
+         MPI::Waitall( commRequests.data(), commRequests.size() );
 
          // finish the multiplication by adding the non-local entries
          localMatrix.vectorProduct( globalBuffer, outVectorView, 1.0, 0.0, 0, localOnlySpan.first );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
index f88e315cc..7c03dd7ce 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
@@ -42,12 +42,12 @@ protected:
    VectorType diagonal;
 };
 
-template< typename Matrix, typename Communicator >
-class Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >
-: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+class Diagonal< Matrices::DistributedMatrix< Matrix > >
+: public Preconditioner< Matrices::DistributedMatrix< Matrix > >
 {
 public:
-   using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >;
+   using MatrixType = Matrices::DistributedMatrix< Matrix >;
    using RealType = typename MatrixType::RealType;
    using DeviceType = typename MatrixType::DeviceType;
    using IndexType = typename MatrixType::IndexType;
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index d2227e57b..17746373a 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -53,9 +53,9 @@ solve( ConstVectorViewType b, VectorViewType x ) const
 }
 
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >
 void
-Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >::
+Diagonal< Matrices::DistributedMatrix< Matrix > >::
 update( const MatrixPointer& matrixPointer )
 {
    TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
@@ -87,9 +87,9 @@ update( const MatrixPointer& matrixPointer )
    }
 }
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >
 void
-Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >::
+Diagonal< Matrices::DistributedMatrix< Matrix > >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
    ConstLocalViewType diag_view( diagonal );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index 857d8a063..a4eb9e8aa 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -194,11 +194,11 @@ protected:
 #endif
 };
 
-template< typename Matrix, typename Communicator >
-class ILU0_impl< Matrices::DistributedMatrix< Matrix, Communicator >, double, Devices::Cuda, int >
-: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+class ILU0_impl< Matrices::DistributedMatrix< Matrix >, double, Devices::Cuda, int >
+: public Preconditioner< Matrices::DistributedMatrix< Matrix > >
 {
-   using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >;
+   using MatrixType = Matrices::DistributedMatrix< Matrix >;
 public:
    using RealType = double;
    using DeviceType = Devices::Cuda;
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 7a1879923..d98b78294 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -54,24 +54,21 @@ struct Traits
    static void waitForSynchronization( VectorViewType v ) {}
 };
 
-template< typename Matrix, typename Communicator >
-struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+struct Traits< Matrices::DistributedMatrix< Matrix > >
 {
    using VectorType = Containers::DistributedVector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
    using VectorViewType = Containers::DistributedVectorView
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
    using ConstVectorViewType = Containers::DistributedVectorView
          < std::add_const_t< typename Matrix::RealType >,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
 
    using LocalVectorType = Containers::Vector
          < typename Matrix::RealType,
@@ -87,12 +84,11 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
            typename Matrix::IndexType >;
 
    // compatibility wrappers for some DistributedMatrix methods
-   static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix, Communicator >& m )
-   { return m.getLocalMatrix(); }
+   static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getLocalMatrix(); }
    static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); }
    static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); }
 
-   static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
+   static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getCommunicationGroup(); }
    static void startSynchronization( VectorViewType v ) { v.startSynchronization(); }
    static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); }
 };
diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h
index 2afda7aad..63b8fc273 100644
--- a/src/TNL/TypeTraits.h
+++ b/src/TNL/TypeTraits.h
@@ -253,4 +253,21 @@ public:
     static constexpr bool value = type::value;
 };
 
+/**
+ * \brief Type trait: `value` is true iff `std::decay_t<T>` has a `getCommunicationGroup()` method callable without arguments.
+ */
+template< typename T >
+class HasGetCommunicationGroupMethod
+{
+private:
+    typedef char YesType[1];
+    typedef char NoType[2];
+    // The probe takes a pointer to the method's return type, so detection works for any return type (int, pointer, class, void).
+    template< typename C > static YesType& test( std::add_pointer_t< decltype(std::declval< C >().getCommunicationGroup()) > );
+    template< typename C > static NoType& test(...);
+
+public:
+    static constexpr bool value = ( sizeof( test< std::decay_t<T> >(nullptr) ) == sizeof( YesType ) );
+};
+
 } //namespace TNL
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index f594a081b..e25739afe 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -9,7 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedArray.h>
 #include <TNL/Containers/Partitioner.h>
 
@@ -17,6 +16,7 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::MPI;
 
 /*
  * Light check of DistributedArray.
@@ -32,7 +32,6 @@ class DistributedArrayTest
 protected:
    using ValueType = typename DistributedArray::ValueType;
    using DeviceType = typename DistributedArray::DeviceType;
-   using CommunicatorType = typename DistributedArray::CommunicatorType;
    using IndexType = typename DistributedArray::IndexType;
    using DistributedArrayType = DistributedArray;
    using ArrayViewType = typename DistributedArrayType::LocalViewType;
@@ -40,12 +39,12 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
    DistributedArrayType distributedArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
 
    // some arbitrary even value (but must be 0 if not distributed)
    const int ghosts = (nproc > 1) ? 4 : 0;
@@ -53,10 +52,10 @@ protected:
    DistributedArrayTest()
    {
       using LocalRangeType = typename DistributedArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedArray.setDistribution( localRange, ghosts, globalSize, group );
 
-      using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >;
+      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
       distributedArray.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
 
       EXPECT_EQ( distributedArray.getLocalRange(), localRange );
@@ -67,10 +66,10 @@ protected:
 
 // types for which DistributedArrayTest is instantiated
 using DistributedArrayTypes = ::testing::Types<
-   DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >
+   DistributedArray< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
-   DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >
+   DistributedArray< double, Devices::Cuda, int >
 #endif
 >;
 
@@ -86,11 +85,9 @@ TYPED_TEST( DistributedArrayTest, checkLocalSizes )
 
 TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const int localSize = this->distributedArray.getLocalView().getSize();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedArray.getSize(), this->globalSize );
 }
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index 5a201980c..a90f09506 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -11,7 +11,6 @@
 
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Containers/DistributedVectorView.h>
 #include <TNL/Containers/Partitioner.h>
@@ -21,6 +20,7 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::MPI;
 
 /*
  * Light check of DistributedVector.
@@ -36,21 +36,20 @@ class DistributedVectorTest
 protected:
    using RealType = typename DistributedVector::RealType;
    using DeviceType = typename DistributedVector::DeviceType;
-   using CommunicatorType = typename DistributedVector::CommunicatorType;
    using IndexType = typename DistributedVector::IndexType;
    using DistributedVectorType = DistributedVector;
    using VectorViewType = typename DistributedVectorType::LocalViewType;
-   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >;
    using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >;
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
    DistributedVectorType v;
    DistributedVectorView v_view;
    HostDistributedVectorType v_host;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
 
    // should be small enough to have fast tests, but large enough to test
    // scan with multiple CUDA grids
@@ -62,11 +61,11 @@ protected:
    DistributedVectorTest()
    {
       using LocalRangeType = typename DistributedVector::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       v.setDistribution( localRange, ghosts, globalSize, group );
 
-      using Synchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< DeviceType >;
-      using HostSynchronizer = typename Partitioner< IndexType, CommunicatorType >::template ArraySynchronizer< Devices::Sequential >;
+      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
+      using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
       v.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
       v_view.setSynchronizer( v.getSynchronizer() );
       v_host.setSynchronizer( std::make_shared<HostSynchronizer>( localRange, ghosts / 2, group ) );
@@ -78,10 +77,10 @@ protected:
 
 // types for which DistributedVectorTest is instantiated
 using DistributedVectorTypes = ::testing::Types<
-   DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >
+   DistributedVector< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
-   DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >
+   DistributedVector< double, Devices::Cuda, int >
 #endif
 >;
 
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index b659beaea..b79b675cf 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -13,10 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -61,16 +61,13 @@ protected:
    using RightReal = std::remove_const_t< typename Right::RealType >;
 #ifndef STATIC_VECTOR
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename Left::CommunicatorType;
-      static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value,
-                     "CommunicatorType must be the same for both Left and Right vectors." );
-      using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >;
-      using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >;
+      using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType >;
+      using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType >;
 
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+      const MPI_Comm group = AllGroup();
 
-      const int rank = CommunicatorType::GetRank(group);
-      const int nproc = CommunicatorType::GetSize(group);
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
 
       // some arbitrary value (but must be 0 if not distributed)
       const int ghosts = (nproc > 1) ? 4 : 0;
@@ -98,8 +95,8 @@ protected:
 #else
    #ifdef DISTRIBUTED_VECTOR
       using LocalRangeType = typename LeftVector::LocalRangeType;
-      using Synchronizer = typename Partitioner< typename Left::IndexType, CommunicatorType >::template ArraySynchronizer< typename Left::DeviceType >;
-      const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group );
+      using Synchronizer = typename Partitioner< typename Left::IndexType >::template ArraySynchronizer< typename Left::DeviceType >;
+      const LocalRangeType localRange = Partitioner< typename Left::IndexType >::splitRange( size, group );
 
       _L1.setDistribution( localRange, ghosts, size, group );
       _L2.setDistribution( localRange, ghosts, size, group );
@@ -160,23 +157,23 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorPairs = ::testing::Types<
    #ifndef HAVE_CUDA
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >
+      Pair< DistributedVector<     int,   Devices::Host, int >,
+            DistributedVector<     short, Devices::Host, int > >,
+      Pair< DistributedVector<     int,   Devices::Host, int >,
+            DistributedVectorView< short, Devices::Host, int > >,
+      Pair< DistributedVectorView< int,   Devices::Host, int >,
+            DistributedVector<     short, Devices::Host, int > >,
+      Pair< DistributedVectorView< int,   Devices::Host, int >,
+            DistributedVectorView< short, Devices::Host, int > >
    #else
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >
+      Pair< DistributedVector<     int,   Devices::Cuda, int >,
+            DistributedVector<     short, Devices::Cuda, int > >,
+      Pair< DistributedVector<     int,   Devices::Cuda, int >,
+            DistributedVectorView< short, Devices::Cuda, int > >,
+      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
+            DistributedVector<     short, Devices::Cuda, int > >,
+      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
+            DistributedVectorView< short, Devices::Cuda, int > >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index b7e8a1b95..32f2d52ba 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -2,6 +2,7 @@
 
 #include <TNL/Math.h>
 #include <TNL/TypeTraits.h>
+#include <TNL/Devices/Host.h>
 
 template< typename Vector >
 void setLinearSequence( Vector& deviceVector )
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index 27422513b..485265e4e 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -13,10 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -51,15 +51,14 @@ protected:
 #else
    using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >;
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename VectorOrView::CommunicatorType;
-      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
-      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
 
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+      const MPI_Comm group = AllGroup();
 
-      const int rank = CommunicatorType::GetRank(group);
-      const int nproc = CommunicatorType::GetSize(group);
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
 
       // some arbitrary even value (but must be 0 if not distributed)
       const int ghosts = (nproc > 1) ? 4 : 0;
@@ -75,13 +74,13 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorTypes = ::testing::Types<
    #ifndef HAVE_CUDA
-      DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >
+      DistributedVector<           double, Devices::Host, int >,
+      DistributedVectorView<       double, Devices::Host, int >,
+      DistributedVectorView< const double, Devices::Host, int >
    #else
-      DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >
+      DistributedVector<           double, Devices::Cuda, int >,
+      DistributedVectorView<       double, Devices::Cuda, int >,
+      DistributedVectorView< const double, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -174,10 +173,9 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
    #define SETUP_UNARY_VECTOR_TEST( size ) \
       using VectorType = typename TestFixture::VectorType;     \
       using VectorOrView = typename TestFixture::VectorOrView; \
-      using CommunicatorType = typename VectorOrView::CommunicatorType; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \
-      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       VectorType _V1, _V2;                                     \
       _V1.setDistribution( localRange, this->ghosts, size, this->group ); \
@@ -199,10 +197,9 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       EXPECTED_VECTOR( TestFixture, function );                \
       using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \
       using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \
-      using CommunicatorType = typename VectorOrView::CommunicatorType; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, this->group ); \
-      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       HostVector _V1h;                                         \
       HostExpectedVector expected_h;                           \
@@ -282,11 +279,8 @@ void expect_vectors_near( const Left& _v1, const Right& _v2 )
    using LeftNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Left::RealType > >;
    using RightNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Right::RealType > >;
 #ifdef DISTRIBUTED_VECTOR
-   using CommunicatorType = typename Left::CommunicatorType;
-   static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value,
-                  "CommunicatorType must be the same for both Left and Right vectors." );
-   using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >;
-   using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >;
+   using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >;
+   using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >;
 #else
    using LeftVector = Vector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >;
    using RightVector = Vector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >;
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index 4ad0c8303..f73b502cc 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -13,10 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -52,15 +52,14 @@ protected:
 #else
    using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >;
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename VectorOrView::CommunicatorType;
-      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
-      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
 
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+      const MPI_Comm group = AllGroup();
 
-      const int rank = CommunicatorType::GetRank(group);
-      const int nproc = CommunicatorType::GetSize(group);
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
 
       // some arbitrary value (but must be 0 if not distributed)
       const int ghosts = (nproc > 1) ? 4 : 0;
@@ -84,8 +83,8 @@ protected:
 #else
    #ifdef DISTRIBUTED_VECTOR
       using LocalRangeType = typename VectorOrView::LocalRangeType;
-      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType, CommunicatorType >::template ArraySynchronizer< typename VectorOrView::DeviceType >;
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group );
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >;
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, group );
       _V1.setDistribution( localRange, ghosts, size, group );
       _V1.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
    #else
@@ -111,13 +110,13 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorTypes = ::testing::Types<
    #ifndef HAVE_CUDA
-      DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >
+      DistributedVector<           double, Devices::Host, int >,
+      DistributedVectorView<       double, Devices::Host, int >,
+      DistributedVectorView< const double, Devices::Host, int >
    #else
-      DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >
+      DistributedVector<           double, Devices::Cuda, int >,
+      DistributedVectorView<       double, Devices::Cuda, int >,
+      DistributedVectorView< const double, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index 4cc584672..5e893e111 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -9,12 +9,12 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Matrices/SparseMatrix.h>
 
 using namespace TNL;
+using namespace TNL::MPI;
 
 template< typename Vector >
 void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = 0 )
@@ -32,7 +32,7 @@ void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset =
 template< typename Matrix, typename RowCapacities >
 void setMatrix( Matrix& matrix, const RowCapacities& rowCapacities )
 {
-   using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential >, typename Matrix::CommunicatorType >;
+   using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential > >;
    using HostRowCapacities = typename RowCapacities::template Self< typename RowCapacities::RealType, TNL::Devices::Sequential >;
 
    HostMatrix hostMatrix;
@@ -65,20 +65,19 @@ class DistributedMatrixTest
 protected:
    using RealType = typename DistributedMatrix::RealType;
    using DeviceType = typename DistributedMatrix::DeviceType;
-   using CommunicatorType = typename DistributedMatrix::CommunicatorType;
    using IndexType = typename DistributedMatrix::IndexType;
    using DistributedMatrixType = DistributedMatrix;
 
    using RowCapacitiesVector = typename DistributedMatrixType::CompressedRowLengthsVector;
    using GlobalVector = Containers::Vector< RealType, DeviceType, IndexType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
 
    DistributedMatrixType matrix;
 
@@ -87,7 +86,7 @@ protected:
    DistributedMatrixTest()
    {
       using LocalRangeType = typename DistributedMatrix::LocalRangeType;
-      const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( globalSize, group );
       matrix.setDistribution( localRange, globalSize, globalSize, group );
       rowCapacities.setDistribution( localRange, 0, globalSize, group );
 
@@ -100,10 +99,10 @@ protected:
 
 // types for which DistributedMatrixTest is instantiated
 using DistributedMatrixTypes = ::testing::Types<
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int > >
 #ifdef HAVE_CUDA
    ,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int > >
 #endif
 >;
 
@@ -111,11 +110,9 @@ TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes );
 
 TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const int localSize = this->matrix.getLocalMatrix().getRows();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->matrix.getRows(), this->globalSize );
 }
-- 
GitLab


From 6f74d8fa517830bda204e209fe2946bd16920ef5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 10:48:06 +0100
Subject: [PATCH 44/50] MPI refactoring: removed MpiCommunicator from
 DistributedMesh

MpiCommunicator was also removed from DistributedMeshSynchronizer,
PVTUReader and PVTUWriter.
---
 src/TNL/Containers/ByteArraySynchronizer.h    |  6 +-
 .../DistributedMeshes/DistributedMesh.h       | 20 +++---
 .../DistributedMeshSynchronizer.h             | 52 +++++++--------
 .../DistributedMeshes/distributeSubentities.h | 65 +++++++++----------
 src/TNL/Meshes/Readers/PVTUReader.h           | 13 ++--
 src/TNL/Meshes/Writers/PVTUWriter.h           |  3 +-
 src/TNL/Meshes/Writers/PVTUWriter.hpp         |  7 +-
 src/Tools/tnl-game-of-life.cpp                | 20 +++---
 src/Tools/tnl-test-distributed-mesh.h         | 22 +++----
 .../DistributedMeshes/DistributedMeshTest.h   | 56 ++++++++--------
 10 files changed, 124 insertions(+), 140 deletions(-)

diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h
index e25260909..0bfed4d92 100644
--- a/src/TNL/Containers/ByteArraySynchronizer.h
+++ b/src/TNL/Containers/ByteArraySynchronizer.h
@@ -17,7 +17,7 @@
 #include <async/threadpool.h>
 
 #include <TNL/Containers/ArrayView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Timer.h>
 
 namespace TNL {
@@ -42,7 +42,7 @@ private:
 
 public:
    using ByteArrayView = ArrayView< std::uint8_t, Device, Index >;
-   using RequestsVector = std::vector< typename Communicators::MpiCommunicator::Request >;
+   using RequestsVector = std::vector< MPI_Request >;
 
    enum class AsyncPolicy {
       synchronous,
@@ -105,7 +105,7 @@ public:
          // immediate start, deferred synchronization (but still in the same thread)
          auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
          auto worker = [requests] () mutable {
-            Communicators::MpiCommunicator::WaitAll( requests.data(), requests.size() );
+            MPI::Waitall( requests.data(), requests.size() );
          };
          this->async_op = std::async( std::launch::deferred, worker );
       }
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
index 9a79f823d..21116d357 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
@@ -13,7 +13,7 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Meshes/DistributedMeshes/GlobalIndexStorage.h>
 #include <TNL/Meshes/MeshDetails/IndexPermutationApplier.h>
 
@@ -34,8 +34,6 @@ public:
    using PointType          = typename Mesh::PointType;
    using RealType           = typename PointType::RealType;
    using GlobalIndexArray   = typename Mesh::GlobalIndexArray;
-   using CommunicatorType   = Communicators::MpiCommunicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using VTKTypesArrayType  = Containers::Array< std::uint8_t, Devices::Sequential, GlobalIndexType >;
 
    DistributedMesh() = default;
@@ -101,12 +99,12 @@ public:
    /**
     * Methods specific to the distributed mesh
     */
-   void setCommunicationGroup( CommunicationGroup group )
+   void setCommunicationGroup( MPI_Comm group )
    {
       this->group = group;
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -190,10 +188,10 @@ public:
       const GlobalIndexType verticesCount = localMesh.template getEntitiesCount< 0 >();
       const GlobalIndexType cellsCount = localMesh.template getEntitiesCount< Mesh::getMeshDimension() >();
 
-      CommunicatorType::Barrier();
-      for( int i = 0; i < CommunicatorType::GetSize(); i++ ) {
-         if( i == CommunicatorType::GetRank() ) {
-            str << "MPI rank:\t" << CommunicatorType::GetRank() << "\n"
+      MPI::Barrier();
+      for( int i = 0; i < MPI::GetSize(); i++ ) {
+         if( i == MPI::GetRank() ) {
+            str << "MPI rank:\t" << MPI::GetRank() << "\n"
                 << "\tMesh dimension:\t" << getMeshDimension() << "\n"
                 << "\tCell topology:\t" << getType( typename Cell::EntityTopology{} ) << "\n"
                 << "\tCells count:\t" << cellsCount << "\n"
@@ -230,13 +228,13 @@ public:
             }
             str.flush();
          }
-         CommunicatorType::Barrier();
+         MPI::Barrier();
       }
    }
 
 protected:
    MeshType localMesh;
-   CommunicationGroup group = CommunicatorType::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    int ghostLevels = 0;
 
    // vtkGhostType arrays for points and cells (cached for output into VTK formats)
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 382de6905..36f28ba45 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/ByteArraySynchronizer.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Meshes {
@@ -40,7 +41,6 @@ class DistributedMeshSynchronizer
 public:
    using DeviceType = typename DistributedMesh::DeviceType;
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
-   using CommunicatorType = typename DistributedMesh::CommunicatorType;
    using ByteArrayView = typename Base::ByteArrayView;
    using RequestsVector = typename Base::RequestsVector;
 
@@ -61,8 +61,8 @@ public:
                      "Global indices are not allocated properly." );
 
       group = mesh.getCommunicationGroup();
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // exchange the global index offsets so that each rank can determine the
       // owner of every entity by its global index
@@ -71,9 +71,9 @@ public:
       {
          Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc );
          sendbuf.setValue( ownStart );
-         CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                     globalOffsets.getData(), 1,
-                                     group );
+         MPI::Alltoall( sendbuf.getData(), 1,
+                        globalOffsets.getData(), 1,
+                        group );
       }
 
       // count local ghost entities for each rank
@@ -110,9 +110,9 @@ public:
          for( int j = 0; j < nproc; j++ )
          for( int i = 0; i < nproc; i++ )
             sendbuf.setElement( j, i, localGhostCounts[ i ] );
-         CommunicatorType::Alltoall( &sendbuf(0, 0), nproc,
-                                     &ghostEntitiesCounts(0, 0), nproc,
-                                     group );
+         MPI::Alltoall( &sendbuf(0, 0), nproc,
+                        &ghostEntitiesCounts(0, 0), nproc,
+                        group );
       }
 
       // allocate ghost offsets
@@ -136,7 +136,7 @@ public:
          ghostOffsets[ 0 ] = ghostOffset;
          for( int i = 0; i < nproc; i++ ) {
             if( ghostEntitiesCounts( rank, i ) > 0 ) {
-               requests.push_back( CommunicatorType::ISend(
+               requests.push_back( MPI::Isend(
                         mesh.template getGlobalIndices< EntityDimension >().getData() + ghostOffset,
                         ghostEntitiesCounts( rank, i ),
                         i, 0, group ) );
@@ -151,7 +151,7 @@ public:
          // receive ghost indices from the neighboring ranks
          for( int j = 0; j < nproc; j++ ) {
             if( ghostEntitiesCounts( j, rank ) > 0 ) {
-               requests.push_back( CommunicatorType::IRecv(
+               requests.push_back( MPI::Irecv(
                         ghostNeighbors.getData() + ghostNeighborOffsets[ j ],
                         ghostEntitiesCounts( j, rank ),
                         j, 0, group ) );
@@ -159,7 +159,7 @@ public:
          }
 
          // wait for all communications to finish
-         CommunicatorType::WaitAll( requests.data(), requests.size() );
+         MPI::Waitall( requests.data(), requests.size() );
 
          // convert received ghost indices from global to local
          ghostNeighbors -= ownStart;
@@ -201,7 +201,7 @@ public:
    virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
    {
       auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
-      CommunicatorType::WaitAll( requests.data(), requests.size() );
+      MPI::Waitall( requests.data(), requests.size() );
    }
 
    virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
@@ -209,8 +209,8 @@ public:
       TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ],
                      "The array does not have the expected size." );
 
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // allocate send buffers (setSize does nothing if the array size is already correct)
       sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] );
@@ -221,7 +221,7 @@ public:
       // issue all receive async operations
       for( int j = 0; j < nproc; j++ ) {
          if( ghostEntitiesCounts( rank, j ) > 0 ) {
-            requests.push_back( CommunicatorType::IRecv(
+            requests.push_back( MPI::Irecv(
                      array.getData() + bytesPerValue * ghostOffsets[ j ],
                      bytesPerValue * ghostEntitiesCounts( rank, j ),
                      j, 0, group ) );
@@ -245,7 +245,7 @@ public:
             Algorithms::ParallelFor< DeviceType >::exec( (GlobalIndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset );
 
             // issue async send operation
-            requests.push_back( CommunicatorType::ISend(
+            requests.push_back( MPI::Isend(
                      sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ],
                      bytesPerValue * ghostEntitiesCounts( i, rank ),
                      i, 0, group ) );
@@ -268,8 +268,8 @@ public:
    {
       TNL_ASSERT_EQ( pattern.getRows(), ghostOffsets[ ghostOffsets.getSize() - 1 ], "invalid sparse pattern matrix" );
 
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // buffer for asynchronous communication requests
       RequestsVector requests;
@@ -306,7 +306,7 @@ public:
             // send our row sizes to the target rank
             if( ! assumeConsistentRowCapacities )
                // issue async send operation
-               requests.push_back( CommunicatorType::ISend(
+               requests.push_back( MPI::Isend(
                         send_rowCapacities.getData() + send_rankOffsets[ i ],
                         ghostNeighborOffsets[ i + 1 ] - ghostNeighborOffsets[ i ],
                         i, 1, group ) );
@@ -334,7 +334,7 @@ public:
             if( send_rankOffsets[ i + 1 ] == send_rankOffsets[ i ] )
                continue;
             // issue async send operation
-            requests.push_back( CommunicatorType::ISend(
+            requests.push_back( MPI::Isend(
                      send_columnIndices.getData() + send_rowPointers[ send_rankOffsets[ i ] ],
                      send_rowPointers[ send_rankOffsets[ i + 1 ] ] - send_rowPointers[ send_rankOffsets[ i ] ],
                      i, 0, group ) );
@@ -369,7 +369,7 @@ public:
             else {
                // receive row sizes from the sender
                // issue async recv operation
-               row_lengths_requests.push_back( CommunicatorType::IRecv(
+               row_lengths_requests.push_back( MPI::Irecv(
                         recv_rowPointers.getData() + recv_rankOffsets[ i ],
                         ghostOffsets[ i + 1 ] - ghostOffsets[ i ],
                         i, 1, group ) );
@@ -378,7 +378,7 @@ public:
 
          if( ! assumeConsistentRowCapacities ) {
             // wait for all row lengths
-            CommunicatorType::WaitAll( row_lengths_requests.data(), row_lengths_requests.size() );
+            MPI::Waitall( row_lengths_requests.data(), row_lengths_requests.size() );
 
             // scan the rowPointers array to convert
             Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView;
@@ -393,7 +393,7 @@ public:
             if( recv_rankOffsets[ i + 1 ] == recv_rankOffsets[ i ] )
                continue;
             // issue async recv operation
-            requests.push_back( CommunicatorType::IRecv(
+            requests.push_back( MPI::Irecv(
                      recv_columnIndices.getData() + recv_rowPointers[ recv_rankOffsets[ i ] ],
                      recv_rowPointers[ recv_rankOffsets[ i + 1 ] ] - recv_rowPointers[ recv_rankOffsets[ i ] ],
                      i, 0, group ) );
@@ -401,7 +401,7 @@ public:
       }
 
       // wait for all communications to finish
-      CommunicatorType::WaitAll( requests.data(), requests.size() );
+      MPI::Waitall( requests.data(), requests.size() );
 
       return std::make_tuple( recv_rankOffsets, recv_rowPointers, recv_columnIndices );
    }
@@ -445,7 +445,7 @@ public:
 
 protected:
    // communication group taken from the distributed mesh
-   typename CommunicatorType::CommunicationGroup group;
+   MPI_Comm group;
 
    /**
     * Global offsets: array of size nproc where the i-th value is the lowest
diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
index 63a10b1cf..120cadf80 100644
--- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
+++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
@@ -19,14 +19,14 @@ namespace TNL {
 namespace Meshes {
 namespace DistributedMeshes {
 
-template< typename CommunicatorType, typename GlobalIndexType >
+template< typename GlobalIndexType >
 auto
-exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
+exchangeGhostEntitySeeds( MPI_Comm group,
                           const std::vector< std::vector< GlobalIndexType > >& seeds_vertex_indices,
                           const std::vector< std::vector< GlobalIndexType > >& seeds_entity_offsets )
 {
-   const int rank = CommunicatorType::GetRank( group );
-   const int nproc = CommunicatorType::GetSize( group );
+   const int rank = MPI::GetRank( group );
+   const int nproc = MPI::GetSize( group );
 
    // exchange sizes of the arrays
    Containers::Array< GlobalIndexType, Devices::Host, int > sizes_vertex_indices( nproc ), sizes_entity_offsets( nproc );
@@ -36,12 +36,12 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
          sendbuf_indices[ i ] = seeds_vertex_indices[ i ].size();
          sendbuf_offsets[ i ] = seeds_entity_offsets[ i ].size();
       }
-      CommunicatorType::Alltoall( sendbuf_indices.getData(), 1,
-                                  sizes_vertex_indices.getData(), 1,
-                                  group );
-      CommunicatorType::Alltoall( sendbuf_offsets.getData(), 1,
-                                  sizes_entity_offsets.getData(), 1,
-                                  group );
+      MPI::Alltoall( sendbuf_indices.getData(), 1,
+                     sizes_vertex_indices.getData(), 1,
+                     group );
+      MPI::Alltoall( sendbuf_offsets.getData(), 1,
+                     sizes_entity_offsets.getData(), 1,
+                     group );
    }
 
    // allocate arrays for the results
@@ -54,17 +54,17 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
    }
 
    // buffer for asynchronous communication requests
-   std::vector< typename CommunicatorType::Request > requests;
+   std::vector< MPI_Request > requests;
 
    // issue all async receive operations
    for( int j = 0; j < nproc; j++ ) {
       if( j == rank )
           continue;
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                foreign_seeds_vertex_indices[ j ].data(),
                foreign_seeds_vertex_indices[ j ].size(),
                j, 0, group ) );
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                foreign_seeds_entity_offsets[ j ].data(),
                foreign_seeds_entity_offsets[ j ].size(),
                j, 1, group ) );
@@ -74,30 +74,30 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
    for( int i = 0; i < nproc; i++ ) {
       if( i == rank )
           continue;
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                seeds_vertex_indices[ i ].data(),
                seeds_vertex_indices[ i ].size(),
                i, 0, group ) );
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                seeds_entity_offsets[ i ].data(),
                seeds_entity_offsets[ i ].size(),
                i, 1, group ) );
    }
 
    // wait for all communications to finish
-   CommunicatorType::WaitAll( requests.data(), requests.size() );
+   MPI::Waitall( requests.data(), requests.size() );
 
    return std::make_tuple( foreign_seeds_vertex_indices, foreign_seeds_entity_offsets );
 }
 
-template< typename CommunicatorType, typename GlobalIndexType >
+template< typename GlobalIndexType >
 auto
-exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
+exchangeGhostIndices( MPI_Comm group,
                       const std::vector< std::vector< GlobalIndexType > >& foreign_ghost_indices,
                       const std::vector< std::vector< GlobalIndexType > >& seeds_local_indices )
 {
-   const int rank = CommunicatorType::GetRank( group );
-   const int nproc = CommunicatorType::GetSize( group );
+   const int rank = MPI::GetRank( group );
+   const int nproc = MPI::GetSize( group );
 
    // allocate arrays for the results
    std::vector< std::vector< GlobalIndexType > > ghost_indices;
@@ -106,13 +106,13 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
       ghost_indices[ i ].resize( seeds_local_indices[ i ].size() );
 
    // buffer for asynchronous communication requests
-   std::vector< typename CommunicatorType::Request > requests;
+   std::vector< MPI_Request > requests;
 
    // issue all async receive operations
    for( int j = 0; j < nproc; j++ ) {
       if( j == rank )
           continue;
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                ghost_indices[ j ].data(),
                ghost_indices[ j ].size(),
                j, 0, group ) );
@@ -122,14 +122,14 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
    for( int i = 0; i < nproc; i++ ) {
       if( i == rank )
           continue;
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                foreign_ghost_indices[ i ].data(),
                foreign_ghost_indices[ i ].size(),
                i, 0, group ) );
    }
 
    // wait for all communications to finish
-   CommunicatorType::WaitAll( requests.data(), requests.size() );
+   MPI::Waitall( requests.data(), requests.size() );
 
    return ghost_indices;
 }
@@ -145,7 +145,6 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
    using LocalIndexType = typename DistributedMesh::LocalIndexType;
    using LocalMesh = typename DistributedMesh::MeshType;
-   using CommunicatorType = typename DistributedMesh::CommunicatorType;
 
    static_assert( ! std::is_same< DeviceType, Devices::Cuda >::value,
                   "this method can be called only for host meshes" );
@@ -154,8 +153,8 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    if( mesh.getGhostLevels() <= 0 )
       throw std::logic_error( "There are no ghost levels on the distributed mesh." );
 
-   const int rank = CommunicatorType::GetRank( mesh.getCommunicationGroup() );
-   const int nproc = CommunicatorType::GetSize( mesh.getCommunicationGroup() );
+   const int rank = MPI::GetRank( mesh.getCommunicationGroup() );
+   const int nproc = MPI::GetSize( mesh.getCommunicationGroup() );
 
    // 0. exchange cell data to prepare getCellOwner for use in getEntityOwner
    DistributedMeshSynchronizer< DistributedMesh, DistributedMesh::getMeshDimension() > cell_synchronizer;
@@ -235,9 +234,9 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
 
       Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc );
       sendbuf.setValue( localEntitiesCount );
-      CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                  globalOffsets.getData(), 1,
-                                  mesh.getCommunicationGroup() );
+      MPI::Alltoall( sendbuf.getData(), 1,
+                     globalOffsets.getData(), 1,
+                     mesh.getCommunicationGroup() );
    }
    globalOffsets.template scan< Algorithms::ScanType::Exclusive >();
 
@@ -288,7 +287,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    }
 
    // 5. exchange seeds for ghost entities
-   const auto foreign_seeds = exchangeGhostEntitySeeds< CommunicatorType >( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets );
+   const auto foreign_seeds = exchangeGhostEntitySeeds( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets );
    const auto& foreign_seeds_vertex_indices = std::get< 0 >( foreign_seeds );
    const auto& foreign_seeds_entity_offsets = std::get< 1 >( foreign_seeds );
 
@@ -373,7 +372,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
       });
 
       // 6b. exchange global ghost indices
-      const auto ghost_indices = exchangeGhostIndices< CommunicatorType >( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices );
+      const auto ghost_indices = exchangeGhostIndices( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices );
 
       // 6c. set the global indices of our ghost entities
       bool done = true;
@@ -387,7 +386,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
 
       // 6d. check if finished
       bool all_done = false;
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
       if( all_done )
          break;
    }
diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h
index 393ee1551..725aa7fec 100644
--- a/src/TNL/Meshes/Readers/PVTUReader.h
+++ b/src/TNL/Meshes/Readers/PVTUReader.h
@@ -14,7 +14,7 @@
 
 #include <experimental/filesystem>
 
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Meshes/Readers/VTUReader.h>
 #include <TNL/Meshes/MeshDetails/layers/EntityTags/Traits.h>
 
@@ -67,13 +67,13 @@ class PVTUReader
          throw MeshReaderError( "PVTUReader", "the file does not contain any <Piece> element." );
 
       // check that the number of pieces matches the number of MPI ranks
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       if( (int) pieceSources.size() != nproc )
          throw MeshReaderError( "PVTUReader", "the number of subdomains does not match the number of MPI ranks ("
                                               + std::to_string(pieceSources.size()) + " vs " + std::to_string(nproc) + ")." );
 
       // read the local piece source
-      const int rank = CommunicatorType::GetRank( group );
+      const int rank = MPI::GetRank( group );
       localReader.setFileName( pieceSources[ rank ] );
       localReader.detectMesh();
 
@@ -100,12 +100,9 @@ class PVTUReader
 #endif
 
 public:
-   using CommunicatorType = Communicators::MpiCommunicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
-
    PVTUReader() = default;
 
-   PVTUReader( const std::string& fileName, CommunicationGroup group = CommunicatorType::AllGroup )
+   PVTUReader( const std::string& fileName, MPI_Comm group = MPI::AllGroup() )
    : XMLVTK( fileName ), group( group )
    {}
 
@@ -233,7 +230,7 @@ public:
    }
 
 protected:
-   CommunicationGroup group;
+   MPI_Comm group;
 
    int ghostLevels = 0;
    int minCommonVertices = 0;
diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h
index 5aa9cd2b0..2f332d20e 100644
--- a/src/TNL/Meshes/Writers/PVTUWriter.h
+++ b/src/TNL/Meshes/Writers/PVTUWriter.h
@@ -65,9 +65,8 @@ public:
 
    // add all pieces and return the source path for the current rank
    // (useful for parallel writing)
-   template< typename Communicator >
    std::string addPiece( const String& mainFileName,
-                         const typename Communicator::CommunicationGroup group );
+                         const MPI_Comm group );
 
    ~PVTUWriter();
 
diff --git a/src/TNL/Meshes/Writers/PVTUWriter.hpp b/src/TNL/Meshes/Writers/PVTUWriter.hpp
index 71e19da1d..affee65a2 100644
--- a/src/TNL/Meshes/Writers/PVTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/PVTUWriter.hpp
@@ -137,15 +137,14 @@ PVTUWriter< Mesh >::addPiece( const String& mainFileName,
 }
 
 template< typename Mesh >
-   template< typename Communicator >
 std::string
 PVTUWriter< Mesh >::addPiece( const String& mainFileName,
-                              const typename Communicator::CommunicationGroup group )
+                              const MPI_Comm group )
 {
    std::string source;
-   for( int i = 0; i < Communicator::GetSize( group ); i++ ) {
+   for( int i = 0; i < MPI::GetSize( group ); i++ ) {
       const std::string s = addPiece( mainFileName, i );
-      if( i == Communicator::GetRank( group ) )
+      if( i == MPI::GetRank( group ) )
          source = s;
    }
    return source;
diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp
index a2d4f48e9..7003489ab 100644
--- a/src/Tools/tnl-game-of-life.cpp
+++ b/src/Tools/tnl-game-of-life.cpp
@@ -17,13 +17,11 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 using namespace TNL;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 struct MyConfigTag {};
 
 namespace TNL {
@@ -198,8 +196,8 @@ bool runGameOfLife( const Mesh& mesh )
       }
    }
    Index max_count;
-   CommunicatorType::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() );
-   std::cout << "Rank " << CommunicatorType::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl;
+   TNL::MPI::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() );
+   std::cout << "Rank " << TNL::MPI::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl;
    Index reference_cell = 0;
    if( count == max_count ) {
       // find cell which has all points in the central box
@@ -256,7 +254,7 @@ bool runGameOfLife( const Mesh& mesh )
       // create a .pvtu file (only rank 0 actually writes to the file)
       const std::string mainFilePath = "GoL." + std::to_string(iteration) + ".pvtu";
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -266,7 +264,7 @@ bool runGameOfLife( const Mesh& mesh )
       if( mesh.getGhostLevels() > 0 )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
       pvtu.template writePCellData< Real >( "function values" );
-      const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -292,7 +290,7 @@ bool runGameOfLife( const Mesh& mesh )
    Index iteration = 0;
    do {
       iteration++;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          std::cout << "Computing iteration " << iteration << "..." << std::endl;
 
       // iterate over all local entities
@@ -338,7 +336,7 @@ bool runGameOfLife( const Mesh& mesh )
 
       // check if finished
       const bool done = max( f_in.getData() ) == 0 || iteration > max_iter || f_in.getData() == f_out.getData();
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
    }
    while( all_done == false );
 
@@ -351,7 +349,7 @@ void configSetup( Config::ConfigDescription& config )
    config.addRequiredEntry< String >( "input-file", "Input file with the mesh." );
    config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" );
    config.addDelimiter( "MPI settings:" );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int main( int argc, char* argv[] )
@@ -366,7 +364,7 @@ int main( int argc, char* argv[] )
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
-   if( ! CommunicatorType::setup( parameters ) )
+   if( ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String inputFileName = parameters.getParameter< String >( "input-file" );
diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h
index 1b8c59c75..6b748d993 100644
--- a/src/Tools/tnl-test-distributed-mesh.h
+++ b/src/Tools/tnl-test-distributed-mesh.h
@@ -18,13 +18,11 @@
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 using namespace TNL;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 struct MyConfigTag {};
 
 namespace TNL {
@@ -214,7 +212,7 @@ void testSynchronizerOnDevice( const MeshType& mesh )
          if( received != center ) {
             IndexType cellIndexes[ 2 ] = {0, 0};
             const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes );
-            std::cerr << "rank " << CommunicatorType::GetRank()
+            std::cerr << "rank " << TNL::MPI::GetRank()
                       << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")"
                       << " of dimension = " << EntityType::getEntityDimension()
                       << ": received " << received << ", expected = " << center
@@ -224,7 +222,7 @@ void testSynchronizerOnDevice( const MeshType& mesh )
          }
       }
    if( errors > 0 ) {
-      std::cerr << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl;
+      std::cerr << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." << std::endl;
       TNL_ASSERT_TRUE( false, "test failed" );
    }
 }
@@ -273,7 +271,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
       // create a .pvtu file (only rank 0 actually writes to the file)
       const std::string mainFilePath = "data_" + std::to_string(iteration) + ".pvtu";
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -284,7 +282,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
       pvtu.template writePCellData< Real >( "function values" );
       pvtu.template writePCellData< Real >( "test values" );
-      const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -315,7 +313,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
    int iteration = 0;
    do {
       iteration++;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          std::cout << "Computing iteration " << iteration << "..." << std::endl;
 
       const Index prev_sum = sum( f_K.getData() );
@@ -400,14 +398,14 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
          std::cerr << "ERROR: propatation over faces differs from the propagation over neighbor cells. Differing values are:\n";
          for( Index K = 0; K < f_K_view.getSize(); K++ )
             if( f_K_view[ K ] != f_K_test_view[ K ] )
-               std::cerr << "   rank = " << CommunicatorType::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n";
+               std::cerr << "   rank = " << TNL::MPI::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n";
          std::cerr.flush();
          TNL_ASSERT_TRUE( false, "test failed" );
       }
 
       // check if finished
       const bool done = sum( f_K.getData() ) == prev_sum || iteration > max_iterations;
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
    }
    while( all_done == false );
 
@@ -421,7 +419,7 @@ void configSetup( Config::ConfigDescription& config )
    config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" );
    config.addEntry< int >( "max-iterations", "Maximum number of iterations to compute", 100 );
    config.addDelimiter( "MPI settings:" );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int main( int argc, char* argv[] )
@@ -436,7 +434,7 @@ int main( int argc, char* argv[] )
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
-   if( ! CommunicatorType::setup( parameters ) )
+   if( ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String inputFileName = parameters.getParameter< String >( "input-file" );
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index b778937b6..a0eddd162 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -17,7 +17,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Meshes/Readers/PVTUReader.h>
@@ -32,9 +31,6 @@ using namespace TNL::Meshes::DistributedMeshes;
 
 // cannot be deduced from the grid
 using LocalIndexType = short int;
-// we test only with MPI
-using CommunicatorType = Communicators::MpiCommunicator;
-using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
 
 template< typename Mesh >
 struct GridDistributor;
@@ -54,9 +50,9 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > >
 
    GridDistributor() = delete;
 
-   GridDistributor( CoordinatesType rank_sizes, CommunicationGroup group )
-      : rank(CommunicatorType::GetRank(group)),
-        nproc(CommunicatorType::GetSize(group)),
+   GridDistributor( CoordinatesType rank_sizes, MPI_Comm group )
+      : rank(TNL::MPI::GetRank(group)),
+        nproc(TNL::MPI::GetSize(group)),
         rank_sizes(rank_sizes),
         group(group)
    {}
@@ -328,7 +324,7 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > >
    // input parameters
    int rank, nproc;
    CoordinatesType rank_sizes;
-   CommunicationGroup group;
+   MPI_Comm group;
    // output attributes (byproduct of the decomposition, useful for testing)
    CoordinatesType rank_coordinates, local_size, vert_begin, vert_end, cell_begin, cell_end;
    Index verticesCount, cellsCount, localVerticesCount, localCellsCount;
@@ -341,7 +337,7 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
    using Device = typename Mesh::DeviceType;
 
    // check basic interface
-   EXPECT_EQ( mesh.getCommunicationGroup(), CommunicatorType::AllGroup );
+   EXPECT_EQ( mesh.getCommunicationGroup(), TNL::MPI::AllGroup() );
    EXPECT_EQ( mesh.getGhostLevels(), ghostLevels );
    if( ghostLevels > 0 ) {
       EXPECT_EQ( mesh.template getGlobalIndices< 0 >().getSize(), mesh.getLocalMesh().template getEntitiesCount< 0 >() );
@@ -398,12 +394,12 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
          Containers::Array< Index, Device > vert_sendbuf( distributor.nproc ), cell_sendbuf( distributor.nproc );
          vert_sendbuf.setValue( distributor.localVerticesCount );
          cell_sendbuf.setValue( distributor.localCellsCount );
-         CommunicatorType::Alltoall( vert_sendbuf.getData(), 1,
-                                     vert_offsets.getData(), 1,
-                                     distributor.group );
-         CommunicatorType::Alltoall( cell_sendbuf.getData(), 1,
-                                     cell_offsets.getData(), 1,
-                                     distributor.group );
+         TNL::MPI::Alltoall( vert_sendbuf.getData(), 1,
+                             vert_offsets.getData(), 1,
+                             distributor.group );
+         TNL::MPI::Alltoall( cell_sendbuf.getData(), 1,
+                             cell_offsets.getData(), 1,
+                             distributor.group );
       }
       vert_offsets.setElement( distributor.nproc, 0 );
       cell_offsets.setElement( distributor.nproc, 0 );
@@ -661,7 +657,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh )
          if( received != center ) {
             IndexType cellIndexes[ 2 ] = {0, 0};
             const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes );
-            std::cerr << "rank " << CommunicatorType::GetRank()
+            std::cerr << "rank " << TNL::MPI::GetRank()
                       << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")"
                       << " of dimension = " << EntityType::getEntityDimension()
                       << ": received " << received << ", expected = " << center
@@ -671,7 +667,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh )
          }
       }
    if( errors > 0 )
-      FAIL() << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl;
+      FAIL() << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." << std::endl;
 }
 
 template< typename Device, typename EntityType, typename MeshType >
@@ -703,10 +699,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel0 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 0;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -720,10 +716,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel1 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 1;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -738,10 +734,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel2 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 2;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -756,10 +752,10 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 2;
    distributor.decompose( grid, mesh, ghostLevels );
 
@@ -769,7 +765,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    std::string subfilePath;
    {
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -780,7 +776,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
          pvtu.template writePCellData< typename Mesh::GlobalIndexType >( "GlobalIndex" );
       }
-      subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -798,7 +794,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    }
 
    // load and test
-   CommunicatorType::Barrier();
+   TNL::MPI::Barrier();
    Readers::PVTUReader reader( mainFilePath );
    reader.detectMesh();
    EXPECT_EQ( reader.getMeshType(), "Meshes::DistributedMesh" );
@@ -812,8 +808,8 @@ TEST( DistributedMeshTest, PVTUWriterReader )
 
    // cleanup
    EXPECT_EQ( fs::remove( subfilePath ), true );
-   CommunicatorType::Barrier();
-   if( CommunicatorType::GetRank() == 0 ) {
+   TNL::MPI::Barrier();
+   if( TNL::MPI::GetRank() == 0 ) {
       EXPECT_EQ( fs::remove( mainFilePath ), true );
       EXPECT_EQ( fs::remove( baseName ), true );
    }
-- 
GitLab


From 037c825547d5f7efe933c5d5abf45cfc212de317 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 14:50:20 +0100
Subject: [PATCH 45/50] MPI refactoring: removed MpiCommunicator from Python
 bindings

---
 src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp | 3 +--
 src/Python/pytnl/tnl_mpi/tnl_mpi.cpp                | 9 +++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
index 089d59adf..17bf57c12 100644
--- a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
@@ -78,8 +78,7 @@ void export_DistributedMeshWriter( py::module & m, const char* name )
             },
             py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
         // NOTE: only the overload intended for sequential writing is exported, because we don't
-        // have type casters for Communicators::MpiCommunicator::CommunicationGroup
-        // (ideally, the communication group would be compatible with the mpi4py objects)
+        // have type casters for MPI_Comm (ideally, it would be compatible with the mpi4py objects)
         .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ),
               py::arg("mainFileName"), py::arg("subdomainIndex"))
     ;
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
index a12060600..a422795b6 100644
--- a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -3,6 +3,7 @@
 
 // conversions have to be registered for each object file
 #include "../tnl_conversions.h"
+#include "TNL/MPI/Wrappers.h"
 
 // external functions
 void export_DistributedMeshes( py::module & m );
@@ -18,15 +19,15 @@ PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m)
 
     // MPI initialization and finalization
     // https://stackoverflow.com/q/64647846
-    if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) {
+    if( ! TNL::MPI::Initialized() ) {
         int argc = 0;
         char** argv = nullptr;
-        TNL::Communicators::MpiCommunicator::Init( argc, argv );
+        TNL::MPI::Init( argc, argv );
     }
     // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors
     auto cleanup_callback = []() {
-        if( TNL::Communicators::MpiCommunicator::IsInitialized() )
-            TNL::Communicators::MpiCommunicator::Finalize();
+        if( TNL::MPI::Initialized() && ! TNL::MPI::Finalized() )
+            TNL::MPI::Finalize();
     };
     m.add_object("_cleanup", py::capsule(cleanup_callback));
 
-- 
GitLab


From db5c4615096ecafd276f93093a7dd697e7715ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 15:07:52 +0100
Subject: [PATCH 46/50] MPI refactoring: removed MpiCommunicator from the
 distributed grid and related classes

---
 .../tnlDirectEikonalProblem_impl.h            |  12 +-
 .../tnlFastSweepingMethod2D_impl.h            | 188 ++++++-------
 .../tnlFastSweepingMethod3D_impl.h            | 230 +++++++--------
 src/TNL/Functions/CutMeshFunction.h           |  23 +-
 .../DistributedMeshes/DistributedGrid.h       |  81 +++---
 .../DistributedMeshes/DistributedGrid.hpp     | 153 +++++-----
 .../DistributedMeshes/DistributedGridIO.h     |   7 +-
 .../DistributedGridIO_MeshFunction.h          |  28 +-
 .../DistributedGridIO_VectorField.h           |  44 +--
 .../DistributedGridSynchronizer.h             |  18 +-
 .../SubdomainOverlapsGetter.h                 |  49 ++--
 .../SubdomainOverlapsGetter.hpp               |  57 ++--
 .../DistributedMeshes/loadDistributedMesh.h   |   9 +-
 src/TNL/Problems/HeatEquationProblem_impl.h   |  12 +-
 src/TNL/Problems/PDEProblem_impl.h            |   6 +-
 .../Solvers/PDE/TimeDependentPDESolver_impl.h |   2 +-
 .../PDE/TimeIndependentPDESolver_impl.h       |   2 +-
 src/TNL/Solvers/SolverStarter_impl.h          |  12 +-
 src/TNL/Solvers/Solver_impl.h                 |   4 +-
 src/Tools/tnl-init.cpp                        |   4 +-
 src/Tools/tnl-init.h                          |  17 +-
 .../CutDistributedGridTest.cpp                | 116 ++++----
 .../CutDistributedMeshFunctionTest.cpp        |  65 ++---
 .../DistributedMeshes/CutMeshFunctionTest.cpp |  58 ++--
 .../DistributedMeshes/DistributedGridIOTest.h |  24 +-
 .../DistributedGridIO_MPIIOTest.h             |  22 +-
 .../DistributedGridTest_1D.cpp                |  89 +++---
 .../DistributedGridTest_2D.cpp                | 263 +++++++++---------
 .../DistributedGridTest_3D.cpp                |  73 +++--
 .../DistributedVectorFieldIO_MPIIOTestBase.h  |  64 ++---
 30 files changed, 828 insertions(+), 904 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 7bfeb4976..3e1ea757b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod_impl.h
  * Author: oberhuber
  *
@@ -25,7 +25,7 @@ String
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 getType()
 {
-   return String( "DirectEikonalProblem< " + 
+   return String( "DirectEikonalProblem< " +
                   Mesh::getType() + ", " +
                   Anisotropy::getType() + ", " +
                   Real::getType() + ", " +
@@ -54,7 +54,7 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 writeProlog( Logger& logger,
              const Config::ParameterContainer& parameters ) const
 {
-   
+
 }
 
 template< typename Mesh,
@@ -123,7 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( dofs );
   String inputFile = parameters.getParameter< String >( "input-file" );
-  this->initialData->setMesh( this->getMesh() ); 
+  this->initialData->setMesh( this->getMesh() );
   if( CommunicatorType::isDistributed() )
   {
     std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
@@ -132,7 +132,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
     if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
       Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *initialData );
     synchronizer.setDistributedGrid( initialData->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *initialData );
+    synchronizer.synchronize( *initialData );
   }
   else
   {
@@ -190,7 +190,7 @@ solve( DofVectorPointer& dofs )
 {
    FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
    fsm.solve( this->getMesh(), u, anisotropy, initialData );
-   
+
    makeSnapshot();
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index a1ca740e4..14a52ec40 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod2D_impl.h
  * Author: oberhuber
  *
@@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisot
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-  
+
 }
 
 template< typename Real,
@@ -36,7 +36,7 @@ const Index&
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
-  
+
 }
 
 template< typename Real,
@@ -48,68 +48,68 @@ void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-  
+
 }
 
 template< typename Real,
         typename Device,
         typename Index,
         typename Communicator,
-        typename Anisotropy > 
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
         MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
         const MeshFunctionPointer& u )
-{  
+{
   MeshFunctionPointer auxPtr;
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
-  
+
   // Setting overlaps ( WITHOUT MPI SHOULD BE 0 )
   StaticVector vecLowerOverlaps, vecUpperOverlaps;
   setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
-  
+
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
-  
+
   //auxPtr->save( "aux-ini.tnl" );
-  
+
   typename MeshType::Cell cell( *mesh );
-  
+
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
   synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() );
-  synchronizer.template synchronize< Communicator >( aux ); //synchronize initialized overlaps
-  
-  std::cout << "Calculating the values ..." << std::endl; 
+  synchronizer.synchronize( aux ); //synchronize initialized overlaps
+
+  std::cout << "Calculating the values ..." << std::endl;
   while( iteration < this->maxIterations )
   {
-    // calculatedBefore indicates weather we calculated in the last passage of the while cycle 
-    // calculatedBefore is same for all ranks 
+    // calculatedBefore indicates whether we calculated in the last passage of the while cycle
+    // calculatedBefore is same for all ranks
     // without MPI should be FALSE at the end of while cycle body
     int calculatedBefore = 1;
-    
+
     // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle
     // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
-    int calculateMPIAgain = 1;  
-    
+    int calculateMPIAgain = 1;
+
     while( calculatedBefore )
     {
       calculatedBefore = 0;
-      
+
       if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
         calculateMPIAgain = 0;
-        
+
   /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
         /*
          int numThreadsPerBlock = -1;
-         
+
          numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
          if( numThreadsPerBlock <= 16 )
@@ -127,28 +127,28 @@ solve( const MeshPointer& mesh,
          else
          numThreadsPerBlock = 1024;
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-         
+
          if( numThreadsPerBlock == -1 ){
          printf("Fail in setting numThreadsPerBlock.\n");
          break;
          }
-         
-         
-         
+
+
+
          int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-         
+
          //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-         
+
          //Real **sArray = new Real*[numBlocksX*numBlocksY];
          //for( int i = 0; i < numBlocksX * numBlocksY; i++ )
          // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
-         
+
          ArrayContainer BlockIterHost;
          BlockIterHost.setSize( numBlocksX * numBlocksY );
          BlockIterHost.setValue( 1 );
          int IsCalculationDone = 1;
-         
+
          MeshFunctionPointer helpFunc( mesh );
          MeshFunctionPointer helpFunc1( mesh );
          helpFunc1 = auxPtr;
@@ -164,7 +164,7 @@ solve( const MeshPointer& mesh,
          // std::cout<<std::endl;
          unsigned int numWhile = 0;
          while( IsCalculationDone )
-         {      
+         {
          IsCalculationDone = 0;
          helpFunc1 = auxPtr;
          auxPtr = helpFunc;
@@ -185,9 +185,9 @@ solve( const MeshPointer& mesh,
          default:
          this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
          }
-         
-         
-         //Reduction      
+
+
+         //Reduction
          for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
          IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -196,16 +196,16 @@ solve( const MeshPointer& mesh,
          }
          numWhile++;
          //std::cout <<"numWhile = "<< numWhile <<std::endl;
-         
+
          // for( int j = numBlocksY-1; j>-1; j-- ){
          // for( int i = 0; i < numBlocksX; i++ )
          // std::cout << BlockIterHost[ j * numBlocksX + i ];
          // std::cout << std::endl;
          // }
          // std::cout << std::endl;
-         
+
          this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
-         
+
          //std::cout<<std::endl;
          //String s( "aux-"+ std::to_string(numWhile) + ".tnl");
          //aux.save( s );
@@ -215,8 +215,8 @@ solve( const MeshPointer& mesh,
          }
          */
   /**-END-OF-OMP-PARALLEL------------------------------------------------**/
-        
-        
+
+
   // FSM FOR MPI and WITHOUT MPI
         StaticVector boundsFrom; StaticVector boundsTo;
     // UP and RIGHT
@@ -224,75 +224,75 @@ solve( const MeshPointer& mesh,
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save("aux-1.tnl");
-        
+
     // UP and LEFL
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-2.tnl" );
-        
+
     // DOWN and RIGHT
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-3.tnl" );
-        
+
     // DOWN and LEFT
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
+
       }
       if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA?
       {
         calculateMPIAgain = 0;
-          
+
 #ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
         // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 16 );
-        
+
         // Setting number of threads and blocks for kernel
         int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
         int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
         dim3 blockSize( cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY );
-        
+
         // Need for calling functions from kernel
         BaseType ptr;
-        
+
         // True if we should calculate again.
         int calculateCudaBlocksAgain = 1;
-        
+
         // Array that identifies which blocks should be calculated.
         // All blocks should calculate in first passage ( setValue(1) )
         TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY );
         blockCalculationIndicator.setValue( 1 );
         TNL_CHECK_CUDA_DEVICE;
-        
+
         // Array into which we identify the neighbours and then copy it into blockCalculationIndicator
         TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY );
         blockCalculationIndicatorHelp.setValue( 0 );
-        
+
         // number of Blocks for kernel that calculates neighbours.
         int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        
+
         // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         Containers::Vector< RealType, DeviceType, IndexType > helpVec;
         helpVec.setLike( auxPtr.template getData().getData() );
         MeshFunctionPointer helpFunc;
         helpFunc->bind( mesh, helpVec );
-        helpFunc.template modifyData() = auxPtr.template getData(); 
-        
+        helpFunc.template modifyData() = auxPtr.template getData();
+
         // number of iterations of while calculateCudaBlocksAgain
         int numIter = 0;
-               
+
         //int oddEvenBlock = 0;
         while( calculateCudaBlocksAgain )
         {
   /** HERE IS CHESS METHOD (NO MPI) **/
-          
+
           /*
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
@@ -302,25 +302,25 @@ solve( const MeshPointer& mesh,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           
+
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-           
+
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
            helpFunc.template getData< Device>(),
            auxPtr.template modifyData< Device>(),
-           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, 
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           
+
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-           
+
            calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
           */
   /**------------------------------------------------------------------------------------------------*/
-          
-          
+
+
   /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
@@ -328,10 +328,10 @@ solve( const MeshPointer& mesh,
                   blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           // Switching helpFunc and auxPtr.
           auxPtr.swap( helpFunc );
-          
+
           // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
@@ -340,15 +340,15 @@ solve( const MeshPointer& mesh,
           blockCalculationIndicator = blockCalculationIndicatorHelp;
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
-          
+
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
          if( calculateCudaBlocksAgain ){
             calculatedBefore = 1;
           }
-          
+
 /**-----------------------------------------------------------------------------------------------------------*/
           numIter ++;
         }
@@ -364,13 +364,13 @@ solve( const MeshPointer& mesh,
 #endif
       }
 
-      
-/**----------------------MPI-TO-DO---------------------------------------------**/        
+
+/**----------------------MPI-TO-DO---------------------------------------------**/
 #ifdef HAVE_MPI
       if( CommunicatorType::isDistributed() ){
         getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
-       
-        synchronizer.template synchronize< Communicator >( aux );
+
+        synchronizer.synchronize( aux );
       }
 #endif
       if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
@@ -384,9 +384,9 @@ solve( const MeshPointer& mesh,
 
 // PROTECTED FUNCTIONS:
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
               const MeshPointer& mesh)
@@ -406,11 +406,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-bool 
+bool
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
-goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
         MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
         const AnisotropyPointer& anisotropy )
 {
@@ -418,10 +418,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   const MeshType& mesh = aux.getMesh();
   const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
   const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
-  
+
   typename MeshType::Cell cell( mesh );
   cell.refresh();
-  
+
   for( cell.getCoordinates().y() = boundsFrom[1];
           TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
           cell.getCoordinates().y() += stepY )
@@ -444,54 +444,54 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
 
 
 #ifdef HAVE_MPI
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
-  
+
   int calculateFromNeighbours[4] = {0,0,0,0};
   const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
   MPI::Request *requestsInformation;
-  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
-  
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];
+
   int neighCount = 0; // should this thread calculate again?
-  
+
   if( neighbours[0] != -1 ) // LEFT
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[1] != -1 ) // RIGHT
   {
     requestsInformation[neighCount++] =
-            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
-    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[2] != -1 ) //UP
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
     requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup  );
   }
-  
+
   if( neighbours[5] != -1 ) //DOWN
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
   }
   MPI::WaitAll( requestsInformation, neighCount );
-  
+
   MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
   calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
               calculateFromNeighbours[2] || calculateFromNeighbours[3];
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index add4d9610..9468ff1db 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod2D_impl.h
  * Author: oberhuber
  *
@@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisot
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-  
+
 }
 
 template< typename Real,
@@ -36,7 +36,7 @@ const Index&
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
-  
+
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-  
+
 }
 
 template< typename Real,
@@ -67,46 +67,46 @@ solve( const MeshPointer& mesh,
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
-  
+
   // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
   Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps;
   setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
-  
+
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
-  auxPtr->save( "aux-ini.tnl" );   
-  
+  auxPtr->save( "aux-ini.tnl" );
+
   typename MeshType::Cell cell( *mesh );
-  
+
   IndexType iteration( 0 );
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
   synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() );
-  synchronizer.template synchronize< Communicator >( aux ); //synchronization of intial conditions
-  
+  synchronizer.synchronize( aux ); //synchronization of intial conditions
+
   while( iteration < this->maxIterations )
   {
-    // indicates weather we calculated in the last passage of the while cycle 
-    // calculatedBefore is same for all ranks 
+    // indicates whether we calculated in the last passage of the while cycle
+    // calculatedBefore is same for all ranks
     // without MPI should be FALSE at the end of while cycle body
-    int calculatedBefore = 1; 
-    
+    int calculatedBefore = 1;
+
     // indicates if the MPI process should calculate again in upcoming passage of cycle
     // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
-    int calculateMPIAgain = 1; 
-    
+    int calculateMPIAgain = 1;
+
     while( calculatedBefore )
     {
       calculatedBefore = 0;
-      
+
       if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
         calculateMPIAgain = 0;
-        
+
 /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */
         /*int numThreadsPerBlock = -1;
-         
+
          numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
          if( numThreadsPerBlock <= 16 )
@@ -124,26 +124,26 @@ solve( const MeshPointer& mesh,
          else
          numThreadsPerBlock = 1024;
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-         
+
          if( numThreadsPerBlock == -1 ){
             printf("Fail in setting numThreadsPerBlock.\n");
          break;
          }
-         
+
          int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
          //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-         
+
          //Real **sArray = new Real*[numBlocksX*numBlocksY];
          // for( int i = 0; i < numBlocksX * numBlocksY; i++ )
          // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
-         
+
          ArrayContainer BlockIterHost;
          BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
          BlockIterHost.setValue( 1 );
          int IsCalculationDone = 1;
-         
+
          MeshFunctionPointer helpFunc( mesh );
          MeshFunctionPointer helpFunc1( mesh );
          helpFunc1 = auxPtr;
@@ -159,7 +159,7 @@ solve( const MeshPointer& mesh,
          // std::cout<<std::endl;
          unsigned int numWhile = 0;
          while( IsCalculationDone  )
-         {      
+         {
          IsCalculationDone = 0;
          helpFunc1 = auxPtr;
          auxPtr = helpFunc;
@@ -180,7 +180,7 @@ solve( const MeshPointer& mesh,
          default:
          this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
          }
-         //Reduction      
+         //Reduction
          for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
          IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -188,10 +188,10 @@ solve( const MeshPointer& mesh,
          }
          }
          numWhile++;
-         
-         
+
+
          this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
-         
+
          //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
          //aux.save( s );
          }
@@ -200,60 +200,60 @@ solve( const MeshPointer& mesh,
          }
          aux = *auxPtr;*/
 /**------------------------------------------------------------------------------*/
-        
-        
+
+
 /** HERE IS FSM WITH MPI AND WITHOUT MPI */
         StaticVector boundsFrom; StaticVector boundsTo;
-        
-    // TOP, NORTH and EAST        
+
+    // TOP, NORTH and EAST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, NORTH and WEST        
+
+    // TOP, NORTH and WEST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, SOUTH and EAST        
+
+    // TOP, SOUTH and EAST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, SOUTH and WEST        
+
+    // TOP, SOUTH and WEST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-            
-    // BOTTOM, NOTH and EAST        
+
+    // BOTTOM, NORTH and EAST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); 
-        
-    // BOTTOM, NOTH and WEST        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+    // BOTTOM, NORTH and WEST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // BOTTOM, SOUTH and EAST        
+
+    // BOTTOM, SOUTH and EAST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
-        
-    // BOTTOM, SOUTH and WEST        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+    // BOTTOM, SOUTH and WEST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
-        
-        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+
   /**----------------------------------------------------------------------------------*/
       }
       if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain )
@@ -263,50 +263,50 @@ solve( const MeshPointer& mesh,
         // the number should be less than 10^3 (num of threads in one grid is maximally 1024)
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 8 );
-        
+
         // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps)
         int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
         int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
-        int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize );
         if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
           std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-        
+
         // Making the variables for global function CudaUpdateCellCaller.
         dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-        
+
         BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller
-        
-        
+
+
         int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again
-        
+
         // Array containing information about each block in grid, answering question (Have we calculated in this block?)
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
         BlockIterDevice.setValue( 1 ); // calculate all in the first passage
-        
+
         // Helping Array for GetNeighbours3D
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
         BlockIterPom.setValue( 0 ); //doesnt matter what number
-        
-        
-        
+
+
+
         // number of neighbours in one block (1024 threads) for GetNeighbours3D
         int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
-        
-        
-        //MeshFunctionPointer helpFunc1( mesh );      
+
+
+        //MeshFunctionPointer helpFunc1( mesh );
         Containers::Vector< RealType, DeviceType, IndexType > helpVec;
         helpVec.setLike( auxPtr.template getData().getData() );
         MeshFunctionPointer helpFunc;
         helpFunc->bind( mesh, helpVec );
         helpFunc.template modifyData() = auxPtr.template getData();
         Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-                
+
         int numIter = 0; // number of passages of following while cycle
-        
+
         while( BlockIterD ) //main body of cuda code
         {
-          
+
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           // main function that calculates all values in each blocks
           // calculated values are in helpFunc
@@ -319,7 +319,7 @@ solve( const MeshPointer& mesh,
           TNL_CHECK_CUDA_DEVICE;
           // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
           auxPtr.swap( helpFunc );
-          
+
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
           // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
@@ -328,23 +328,23 @@ solve( const MeshPointer& mesh,
           TNL_CHECK_CUDA_DEVICE;
           BlockIterDevice = BlockIterPom;
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-          
+
           // .containsValue(1) is actually parallel reduction implemented in TNL
           BlockIterD = BlockIterDevice.containsValue(1);
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           numIter++;
-          if( BlockIterD ){ 
+          if( BlockIterD ){
             // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
             calculatedBefore = 1;
           }
         }
         if( numIter%2 == 1 ){
-          
+
           // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
           // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
-          auxPtr.swap( helpFunc ); 
+          auxPtr.swap( helpFunc );
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         }
         cudaDeviceSynchronize();
@@ -353,35 +353,35 @@ solve( const MeshPointer& mesh,
         interfaceMap = *interfaceMapPtr;
 #endif
       }
-      
+
 #ifdef HAVE_MPI
       if( CommunicatorType::isDistributed() )
       {
         getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
 
-        // synchronizate the overlaps 
-        synchronizer.template synchronize< Communicator >( aux );
+        // synchronize the overlaps
+        synchronizer.synchronize( aux );
 
       }
 #endif
-      
+
       if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0!
         calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly
     }
     //aux.save( "aux-8.tnl" );
     iteration++;
-    
+
   }
   // Saving the results into Aux for MakeSnapshot function.
-  Aux = auxPtr; 
+  Aux = auxPtr;
   aux.save("aux-final.tnl");
 }
 
 // PROTECTED FUNCTIONS:
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
               const MeshPointer& mesh)
@@ -402,11 +402,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-bool 
+bool
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
-goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
         MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
         const AnisotropyPointer& anisotropy )
 {
@@ -415,10 +415,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
   const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
   const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1;
-  
+
   typename MeshType::Cell cell( mesh );
   cell.refresh();
-  
+
   for( cell.getCoordinates().z() = boundsFrom[2];
           TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0;
           cell.getCoordinates().z() += stepZ )
@@ -446,72 +446,72 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
 
 
 #ifdef HAVE_MPI
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
-  
+
   int calculateFromNeighbours[6] = {0,0,0,0,0,0};
-        
+
   const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
   MPI::Request *requestsInformation;
-  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
-  
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];
+
   int neighCount = 0; // should this thread calculate again?
-  
+
   if( neighbours[0] != -1 ) // WEST
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[1] != -1 ) // EAST
   {
     requestsInformation[neighCount++] =
-            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
-    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[2] != -1 ) //NORTH
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
     requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[5] != -1 ) //SOUTH
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
   }
-  
-  if( neighbours[8] != -1 ) // TOP 
+
+  if( neighbours[8] != -1 ) // TOP
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[17] != -1 ) //BOTTOM
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup );
   }
-  
+
   MPI::WaitAll( requestsInformation, neighCount );
-  
+
   MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
   calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
                       calculateFromNeighbours[2] || calculateFromNeighbours[3] ||
diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h
index 3cc0af53a..b9ec101cf 100644
--- a/src/TNL/Functions/CutMeshFunction.h
+++ b/src/TNL/Functions/CutMeshFunction.h
@@ -14,9 +14,8 @@
 #include <TNL/Containers/StaticVector.h>
 
 namespace TNL {
-namespace Functions {  
-template <  typename CommunicatorType,
-            typename MeshFunctionType,
+namespace Functions {
+template <  typename MeshFunctionType,
             typename OutMesh,
             typename OutDof,
             int outDimension=OutMesh::getMeshDimension(),
@@ -25,10 +24,10 @@ class CutMeshFunction
 {
   public:
     static bool Cut(MeshFunctionType &inputMeshFunction,
-                    OutMesh &outMesh, 
+                    OutMesh &outMesh,
                     OutDof &outData,
-                    Containers::StaticVector<outDimension, int> savedDimensions, 
-                    Containers::StaticVector<codimension,int> reducedDimensions, 
+                    Containers::StaticVector<outDimension, int> savedDimensions,
+                    Containers::StaticVector<codimension,int> reducedDimensions,
                     Containers::StaticVector<codimension,typename MeshFunctionType::IndexType> fixedIndexs )
     {
         bool inCut;
@@ -44,7 +43,7 @@ class CutMeshFunction
             auto toDistributedGrid=outMesh.getDistributedMesh();
             TNL_ASSERT_TRUE(toDistributedGrid!=nullptr,"You are trying cut distributed meshfunction, but output grid is not set up for distribution");
 
-            inCut=toDistributedGrid-> template SetupByCut<CommunicatorType>(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs);
+            inCut=toDistributedGrid->SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs);
             if(inCut)
             {
                toDistributedGrid->setupGrid(outMesh);
@@ -56,7 +55,7 @@ class CutMeshFunction
         {
             typename OutMesh::PointType outOrigin;
             typename OutMesh::PointType outProportions;
-            typename OutMesh::CoordinatesType outDimensions; 
+            typename OutMesh::CoordinatesType outDimensions;
 
             for(int i=0; i<outDimension;i++)
             {
@@ -64,13 +63,13 @@ class CutMeshFunction
                 outProportions[i]=fromMesh.getProportions()[savedDimensions[i]];
                 outDimensions[i]=fromMesh.getDimensions()[savedDimensions[i]];
             }
-            
+
             outMesh.setDimensions(outDimensions);
             outMesh.setDomain(outOrigin,outProportions);
-            
+
             inCut=true;
             localFixedIndexs=fixedIndexs;
-            
+
         }
 
         //copy data
@@ -104,7 +103,7 @@ class CutMeshFunction
         }
 
         return inCut;
-    } 
+    }
 };
 
 } // namespace Functions
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
index 6e346668c..4082024e3 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
@@ -11,8 +11,6 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Logger.h>
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
@@ -20,7 +18,7 @@
 
 
 namespace TNL {
-namespace Meshes { 
+namespace Meshes {
 namespace DistributedMeshes {
 
 
@@ -28,7 +26,7 @@ namespace DistributedMeshes {
 template< int Dimension,
           typename Real,
           typename Device,
-          typename Index >     
+          typename Index >
 class DistributedMesh< Grid< Dimension, Real, Device, Index > >
 {
   public:
@@ -41,44 +39,43 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       typedef Containers::StaticVector< Dimension, IndexType > CoordinatesType;
       typedef Containers::StaticVector< Dimension, IndexType > SubdomainOverlapsType;
 
-      static constexpr int getMeshDimension() { return Dimension; };  
+      static constexpr int getMeshDimension() { return Dimension; };
 
-      static constexpr int getNeighborsCount() { return DirectionCount<Dimension>::get(); } //c++14 may use Directions::pow3(Dimension)-1 
+      static constexpr int getNeighborsCount() { return DirectionCount<Dimension>::get(); } //c++14 may use Directions::pow3(Dimension)-1
 
       DistributedMesh();
 
       ~DistributedMesh();
-      
+
       static void configSetup( Config::ConfigDescription& config );
-      
+
       bool setup( const Config::ParameterContainer& parameters,
-                  const String& prefix );      
-    
+                  const String& prefix );
+
       void setDomainDecomposition( const CoordinatesType& domainDecomposition );
-      
+
       const CoordinatesType& getDomainDecomposition() const;
-      
-      template< typename CommunicatorType >
+
       void setGlobalGrid( const GridType& globalGrid );
-      
+
       const GridType& getGlobalGrid() const;
-      
+
       void setOverlaps( const SubdomainOverlapsType& lower,
                         const SubdomainOverlapsType& upper);
-      
+
       void setupGrid( GridType& grid);
 
       bool isDistributed() const;
-      
+
       bool isBoundarySubdomain() const;
-           
+
       // TODO: replace it with getLowerOverlap() and getUpperOverlap()
       // It is still being used in cuts set-up
       const CoordinatesType& getOverlap() const { return this->overlap;};
-      
+
       //currently used overlaps at this subdomain
       const SubdomainOverlapsType& getLowerOverlap() const;
-      
+
       const SubdomainOverlapsType& getUpperOverlap() const;
 
       //number of elements of local sub domain WITHOUT overlap
@@ -95,7 +92,7 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       //number of elements of local sub domain WITH overlap
       // TODO: replace with localGrid
       const CoordinatesType& getLocalGridSize() const;
-       
+
       //coordinates of begin of local subdomain without overlaps in local grid
       const CoordinatesType& getLocalBegin() const;
 
@@ -104,40 +101,40 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       const PointType& getLocalOrigin() const;
       const PointType& getSpaceSteps() const;
 
-      //aka MPI-communcicator  
-      void setCommunicationGroup(void * group);
-      void * getCommunicationGroup() const;
+      //aka MPI-communicator
+      void setCommunicationGroup(MPI_Comm group);
+      MPI_Comm getCommunicationGroup() const;
 
       template< int EntityDimension >
       IndexType getEntitiesCount() const;
 
       template< typename Entity >
-      IndexType getEntitiesCount() const; 
+      IndexType getEntitiesCount() const;
 
       const int* getNeighbors() const;
-      
-      const int* getPeriodicNeighbors() const;      
 
-      template<typename CommunicatorType, typename DistributedGridType>
-      bool SetupByCut(DistributedGridType &inputDistributedGrid, 
-                 Containers::StaticVector<Dimension, int> savedDimensions, 
-                 Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions, 
+      const int* getPeriodicNeighbors() const;
+
+      template<typename DistributedGridType>
+      bool SetupByCut(DistributedGridType &inputDistributedGrid,
+                 Containers::StaticVector<Dimension, int> savedDimensions,
+                 Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions,
                  Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,IndexType> fixedIndexs);
 
       int getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const;
-      
+
       String printProcessCoords() const;
 
       String printProcessDistr() const;
-      
+
       void writeProlog( Logger& logger );
 
-   public: 
-      
+   public:
+
       bool isThereNeighbor(const CoordinatesType &direction) const;
 
       void setupNeighbors();
-      
+
       void print( std::ostream& str ) const;
 
       GridType globalGrid;
@@ -149,26 +146,26 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       //CoordinatesType globalDimensions;
       CoordinatesType globalBegin;
       PointType spaceSteps;
-      
+
       SubdomainOverlapsType lowerOverlap, upperOverlap, globalLowerOverlap, globalUpperOverlap;
 
       CoordinatesType domainDecomposition;
-      CoordinatesType subdomainCoordinates;   
+      CoordinatesType subdomainCoordinates;
 
       // TODO: static arrays
       int neighbors[ getNeighborsCount() ];
       int periodicNeighbors[ getNeighborsCount() ];
 
-      IndexType Dimensions;        
+      IndexType Dimensions;
       bool distributed;
-        
+
       int rank;
       int nproc;
 
       bool isSet;
 
-      //aka MPI-communicator 
-      void * communicationGroup;
+      //aka MPI-communicator
+      MPI_Comm group;
 
 };
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
index a35b53962..c48fec9af 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
@@ -11,9 +11,9 @@
 #pragma once
 
 #include <cstdlib>
-#include <TNL/Communicators/MpiCommunicator.h>
 
 #include "DistributedGrid.h"
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
    namespace Meshes {
@@ -28,8 +28,6 @@ template<int Dimension, typename Real, typename Device, typename Index >
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 ~DistributedMesh()
 {
-    if(isSet && this->communicationGroup!=nullptr)
-        std::free(this->communicationGroup);
 }
 
 
@@ -57,7 +55,7 @@ setup( const Config::ParameterContainer& parameters,
    return true;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setDomainDecomposition( const CoordinatesType& domainDecomposition )
@@ -65,7 +63,7 @@ setDomainDecomposition( const CoordinatesType& domainDecomposition )
    this->domainDecomposition = domainDecomposition;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getDomainDecomposition() const
@@ -73,18 +71,12 @@ getDomainDecomposition() const
    return this->domainDecomposition;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
-template< typename CommunicatorType >
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setGlobalGrid( const GridType &globalGrid )
 {
-   if(this->isSet && this->communicationGroup != nullptr)
-        std::free(this->communicationGroup);
-   this->communicationGroup= std::malloc(sizeof(typename CommunicatorType::CommunicationGroup));
-
-   *((typename CommunicatorType::CommunicationGroup *)this->communicationGroup) = CommunicatorType::AllGroup;
-   auto group=*((typename CommunicatorType::CommunicationGroup *)this->communicationGroup);
+   this->group = MPI::AllGroup();
 
    this->globalGrid = globalGrid;
    this->isSet=true;
@@ -99,15 +91,12 @@ setGlobalGrid( const GridType &globalGrid )
    this->spaceSteps=globalGrid.getSpaceSteps();
    this->distributed=false;
 
-   if( CommunicatorType::IsInitialized() )
+   this->rank=MPI::GetRank(group);
+   this->nproc=MPI::GetSize(group);
+   //use MPI only if have more than one process
+   if(this->nproc>1)
    {
-      this->rank=CommunicatorType::GetRank(group);
-      this->nproc=CommunicatorType::GetSize(group);
-      //use MPI only if have more than one process
-      if(this->nproc>1)
-      {
-         this->distributed=true;
-      }
+      this->distributed=true;
    }
 
    if( !this->distributed )
@@ -127,10 +116,8 @@ setGlobalGrid( const GridType &globalGrid )
       //compute node distribution
       int dims[ Dimension ];
       for( int i = 0; i < Dimension; i++ )
-         dims[ i ]= this->domainDecomposition[ i ];
-
-
-      CommunicatorType::DimsCreate( this->nproc, Dimension, dims );
+         dims[ i ] = this->domainDecomposition[ i ];
+      MPI::Compute_dims( this->nproc, Dimension, dims );
       for( int i = 0; i < Dimension; i++ )
          this->domainDecomposition[ i ] = dims[ i ];
 
@@ -146,16 +133,16 @@ setGlobalGrid( const GridType &globalGrid )
       for( int i = 0; i < Dimension; i++ )
       {
          numberOfLarger[ i ] = globalGrid.getDimensions()[ i ] % this->domainDecomposition[ i ];
-         
+
          this->localSize[ i ] = globalGrid.getDimensions()[ i ] / this->domainDecomposition[ i ];
-         
+
          if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] )
             this->localSize[ i ] += 1;
-         
+
          if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] )
              this->globalBegin[ i ] = this->subdomainCoordinates[ i ] * this->localSize[ i ];
          else
-             this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + 
+             this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) +
                                      ( this->subdomainCoordinates[ i ] - numberOfLarger[ i ] ) * this->localSize[ i ];
       }
 
@@ -164,7 +151,7 @@ setGlobalGrid( const GridType &globalGrid )
   }
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setOverlaps( const SubdomainOverlapsType& lower,
@@ -191,7 +178,7 @@ setupGrid( GridType& grid)
    grid.setDistMesh(this);
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getSubdomainCoordinates() const
@@ -199,7 +186,7 @@ getSubdomainCoordinates() const
    return this->subdomainCoordinates;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalOrigin() const
@@ -207,15 +194,15 @@ getLocalOrigin() const
    return this->localOrigin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getSpaceSteps() const
 {
    return this->spaceSteps;
 }
-   
-template< int Dimension, typename Real, typename Device, typename Index >     
+
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isDistributed() const
@@ -223,7 +210,7 @@ isDistributed() const
    return this->distributed;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isBoundarySubdomain() const
@@ -234,7 +221,7 @@ isBoundarySubdomain() const
    return false;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLowerOverlap() const
@@ -242,7 +229,7 @@ getLowerOverlap() const
    return this->lowerOverlap;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getUpperOverlap() const
@@ -250,7 +237,7 @@ getUpperOverlap() const
    return this->upperOverlap;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalSize() const
@@ -258,7 +245,7 @@ getLocalSize() const
    return this->localSize;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalSize() const
@@ -266,7 +253,7 @@ getGlobalSize() const
    return this->globalGrid.getDimensions();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::GridType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalGrid() const
@@ -274,7 +261,7 @@ getGlobalGrid() const
     return this->globalGrid;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalBegin() const
@@ -282,7 +269,7 @@ getGlobalBegin() const
    return this->globalBegin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalGridSize() const
@@ -290,7 +277,7 @@ getLocalGridSize() const
    return this->localGridSize;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalBegin() const
@@ -298,7 +285,7 @@ getLocalBegin() const
    return this->localBegin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >      
+template< int Dimension, typename Real, typename Device, typename Index >
    template< int EntityDimension >
 Index
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
@@ -307,7 +294,7 @@ getEntitiesCount() const
    return this->globalGrid. template getEntitiesCount< EntityDimension >();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >       
+template< int Dimension, typename Real, typename Device, typename Index >
    template< typename Entity >
 Index
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
@@ -316,23 +303,23 @@ getEntitiesCount() const
    return this->globalGrid. template getEntitiesCount< Entity >();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
-void 
+template< int Dimension, typename Real, typename Device, typename Index >
+void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
-setCommunicationGroup(void * group)
+setCommunicationGroup(MPI_Comm group)
 {
-    this->communicationGroup=group;
+    this->group=group;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
-void *
+template< int Dimension, typename Real, typename Device, typename Index >
+MPI_Comm
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getCommunicationGroup() const
 {
-    return this->communicationGroup;
+    return this->group;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 int
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const
@@ -347,7 +334,7 @@ getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const
     return ret;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isThereNeighbor(const CoordinatesType &direction) const
@@ -365,7 +352,7 @@ isThereNeighbor(const CoordinatesType &direction) const
 
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setupNeighbors()
@@ -378,7 +365,7 @@ setupNeighbors()
          this->neighbors[ i ] = this->getRankOfProcCoord( coordinates );
       else
          this->neighbors[ i ] = -1;
-      
+
       // Handling periodic neighbors
       for( int d = 0; d < Dimension; d++ )
       {
@@ -388,12 +375,12 @@ setupNeighbors()
             coordinates[ d ] = 0;
          this->periodicNeighbors[ i ] = this->getRankOfProcCoord( coordinates );
       }
-      
+
       //std::cout << "Setting i-th neighbour to " << neighbors[ i ] << " and " << periodicNeighbors[ i ] << std::endl;
    }
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >   
+template< int Dimension, typename Real, typename Device, typename Index >
 const int*
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getNeighbors() const
@@ -402,7 +389,7 @@ getNeighbors() const
     return this->neighbors;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >   
+template< int Dimension, typename Real, typename Device, typename Index >
 const int*
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getPeriodicNeighbors() const
@@ -412,12 +399,12 @@ getPeriodicNeighbors() const
 }
 
 template< int Dimension, typename Real, typename Device, typename Index >
-    template<typename CommunicatorType, typename DistributedGridType >
-bool 
+    template<typename DistributedGridType >
+bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
-SetupByCut(DistributedGridType &inputDistributedGrid, 
-         Containers::StaticVector<Dimension, int> savedDimensions, 
-         Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions, 
+SetupByCut(DistributedGridType &inputDistributedGrid,
+         Containers::StaticVector<Dimension, int> savedDimensions,
+         Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions,
          Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,IndexType> fixedIndexs)
 {
 
@@ -432,21 +419,17 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
       }
 
       //create new group with used nodes
-      typename CommunicatorType::CommunicationGroup *oldGroup=(typename CommunicatorType::CommunicationGroup *)(inputDistributedGrid.getCommunicationGroup());
-      if(this->isSet && this->communicationGroup != nullptr)
-            free(this->communicationGroup);
-      this->communicationGroup = std::malloc(sizeof(typename CommunicatorType::CommunicationGroup));
-
+      const MPI_Comm oldGroup=inputDistributedGrid.getCommunicationGroup();
       if(isInCut)
       {
            this->isSet=true;
-            
+
             auto fromGlobalMesh=inputDistributedGrid.getGlobalGrid();
             //set global grid
             typename GridType::PointType outOrigin;
             typename GridType::PointType outProportions;
             typename GridType::CoordinatesType outDimensions;
-            
+
             for(int i=0; i<Dimension;i++)
             {
                 outOrigin[i]=fromGlobalMesh.getOrigin()[savedDimensions[i]];
@@ -468,14 +451,13 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
                 this->spaceSteps[i]=inputDistributedGrid.getSpaceSteps()[savedDimensions[i]];
             }
 
-            int newRank= getRankOfProcCoord(this->subdomainCoordinates);
-
-            CommunicatorType::CreateNewGroup(isInCut,newRank,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup));
+            int newRank = getRankOfProcCoord(this->subdomainCoordinates);
+            this->group = MPI::Comm_split( oldGroup, 1, newRank );
 
             setupNeighbors();
 
 
-            
+
             bool isDistributed=false;
             for(int i=0;i<Dimension; i++)
             {
@@ -483,7 +465,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
             }
 
             this->distributed=isDistributed;
-            
+
             this->globalGrid.setDimensions(outDimensions);
             this->globalGrid.setDomain(outOrigin,outProportions);
 
@@ -491,7 +473,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
       }
       else
       {
-         CommunicatorType::CreateNewGroup(isInCut,0,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup));
+         this->group = MPI::Comm_split( oldGroup, MPI_UNDEFINED, 0 );
       }
 
       return false;
@@ -517,7 +499,7 @@ printProcessDistr() const
    for(int i=1; i<Dimension; i++)
         res=res+String("-")+convertToString(this->domainDecomposition[i]);
    return res;
-};  
+};
 
 template< int Dimension, typename Real, typename Device, typename Index >
 void
@@ -525,19 +507,18 @@ DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 writeProlog( Logger& logger )
 {
    logger.writeParameter( "Domain decomposition:", this->getDomainDecomposition() );
-}           
+}
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 print( std::ostream& str ) const
 {
-   using Communicator = Communicators::MpiCommunicator;
-   for( int j = 0; j < Communicator::GetSize( Communicator::AllGroup ); j++ )
+   for( int j = 0; j < MPI::GetSize(); j++ )
    {
-      if( j == Communicator::GetRank( Communicator::AllGroup ) )
+      if( j == MPI::GetRank() )
       {
-         str << "Node : " << Communicator::GetRank( Communicator::AllGroup ) << std::endl
+         str << "Node : " << MPI::GetRank() << std::endl
              << " localOrigin : " << localOrigin << std::endl
              << " localBegin : " << localBegin << std::endl
              << " localSize : " << localSize  << std::endl
@@ -558,7 +539,7 @@ print( std::ostream& str ) const
             str << " " << periodicNeighbors[ i ];
          str << std::endl;
       }
-      Communicator::Barrier( Communicator::AllGroup );
+      MPI::Barrier();
    }
 }
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
index 38a7c04f0..edb08baf7 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
@@ -13,7 +13,6 @@
 #include <iostream>
 
 #include <TNL/File.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h>
 #include <TNL/Functions/MeshFunction.h>
@@ -21,11 +20,11 @@
 #include <TNL/Devices/Cuda.h>
 
 namespace TNL {
-namespace Meshes {   
+namespace Meshes {
 namespace DistributedMeshes {
 
 enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1, MpiIO=2 };
-    
+
 template< typename MeshFunction,
           DistrGridIOTypes type = LocalCopy,
           typename Mesh = typename MeshFunction::MeshType,
@@ -34,7 +33,7 @@ class DistributedGridIO
 {
 };
 
-template< typename MeshFunctionType > 
+template< typename MeshFunctionType >
 class DistributedGridIO< MeshFunctionType, Dummy >
 {
     bool save(const String& fileName, MeshFunctionType &meshFunction)
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
index 99f505bba..698d7e41d 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -159,7 +159,7 @@ class DistributedGridIO_MPIIOBase
             meshFunction.save(fileName);
         }
 
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
 
 	   MPI_File file;
       int ok=MPI_File_open( group,
@@ -182,7 +182,7 @@ class DistributedGridIO_MPIIOBase
     {
 
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
@@ -191,7 +191,7 @@ class DistributedGridIO_MPIIOBase
 
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
-       if(Communicators::MpiCommunicator::GetRank(group)==0)
+       if(MPI::GetRank(group)==0)
        {
             MPI_File_seek(file,offset,MPI_SEEK_SET);
             headerSize=writeMeshFunctionHeader(file,meshFunction,dataCount);
@@ -334,7 +334,7 @@ class DistributedGridIO_MPIIOBase
          return true;
       }
 
-      MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+      MPI_Comm group=distrGrid->getCommunicationGroup();
 
       MPI_File file;
       if( MPI_File_open( group,
@@ -357,7 +357,7 @@ class DistributedGridIO_MPIIOBase
     {
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
 
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
@@ -366,7 +366,7 @@ class DistributedGridIO_MPIIOBase
 
        int headerSize=0;
 
-       if(Communicators::MpiCommunicator::GetRank(group)==0)
+       if(MPI::GetRank(group)==0)
        {
             MPI_File_seek(file,offset,MPI_SEEK_SET);
             headerSize=readMeshFunctionHeader(file,meshFunction,dataCount);
@@ -443,7 +443,7 @@ class DistributedGridIO<
       static bool save(const String& fileName, MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
@@ -452,14 +452,14 @@ class DistributedGridIO<
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
       };
 
       static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
@@ -470,7 +470,7 @@ class DistributedGridIO<
             return true;
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 };
@@ -492,26 +492,26 @@ class DistributedGridIO<
       static bool save(const String& fileName, MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             typename MeshFunctionType::RealType* data=meshFunction.getData().getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 
       static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             typename MeshFunctionType::RealType* data = meshFunction.getData().getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::load(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 };
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
index 52217c336..8febf3c72 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
@@ -49,7 +49,7 @@ class DistributedGridIO_VectorField<
    static bool save(const String& fileName, Functions::VectorField< Size, MeshFunctionType > &vectorField)
    {
 #ifdef HAVE_MPI
-        if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+        if(MPI::isInitialized())//i.e. - isUsed
         {
             auto *distrGrid=vectorField.getMesh().getDistributedMesh();
 			if(distrGrid==NULL)
@@ -58,9 +58,9 @@ class DistributedGridIO_VectorField<
                                 return true;
 			}
 
-            MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+            MPI_Comm group=distrGrid->getCommunicationGroup();
 
-           //write 
+           //write
            MPI_File file;
            MPI_File_open( group,
                           const_cast< char* >( fileName.getString() ),
@@ -68,12 +68,12 @@ class DistributedGridIO_VectorField<
                           MPI_INFO_NULL,
                           &file);
 
-          
-           int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset      
-           if(Communicators::MpiCommunicator::GetRank(group)==0)
+
+           int offset=0; //global offset -> every mesh function creates its own data types, so we need to manage a global offset
+           if(MPI::GetRank(group)==0)
                offset+=writeVectorFieldHeader(file,vectorField);
            MPI_Bcast(&offset, 1, MPI_INT,0, group);
-           
+
            for( int i = 0; i < vectorField.getVectorDimension(); i++ )
            {
                typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData();  //here manage data transfer Device...
@@ -83,13 +83,13 @@ class DistributedGridIO_VectorField<
                   return false;
            }
 
-           MPI_File_close(&file); 
-           return true;           
+           MPI_File_close(&file);
+           return true;
         }
 #endif
-        std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+        std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
         return false;
-      
+
     };
 
 #ifdef HAVE_MPI
@@ -140,7 +140,7 @@ class DistributedGridIO_VectorField<
     static bool load(const String& fileName, Functions::VectorField<Size,MeshFunctionType> &vectorField)
     {
 #ifdef HAVE_MPI
-        if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+        if(MPI::isInitialized())//i.e. - isUsed
         {
             auto *distrGrid=vectorField.getMesh().getDistributedMesh();
 			if(distrGrid==NULL)
@@ -149,9 +149,9 @@ class DistributedGridIO_VectorField<
                                 return true;
 			}
 
-            MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+            MPI_Comm group=distrGrid->getCommunicationGroup();
 
-           //write 
+           //read
            MPI_File file;
            MPI_File_open( group,
                           const_cast< char* >( fileName.getString() ),
@@ -159,12 +159,12 @@ class DistributedGridIO_VectorField<
                           MPI_INFO_NULL,
                           &file);
 
-          
-           int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset      
-           if(Communicators::MpiCommunicator::GetRank(group)==0)
+
+           int offset=0; //global offset -> every mesh function creates its own datatypes, so we need to manage a global offset
+           if(MPI::GetRank(group)==0)
                offset+=readVectorFieldHeader(file,vectorField);
            MPI_Bcast(&offset, 1, MPI_INT,0, group);
-           
+
            for( int i = 0; i < vectorField.getVectorDimension(); i++ )
            {
                typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData();  //here manage data transfer Device...
@@ -174,13 +174,13 @@ class DistributedGridIO_VectorField<
                   return false;
            }
 
-           MPI_File_close(&file); 
-           return true;           
+           MPI_File_close(&file);
+           return true;
         }
 #endif
-        std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+        std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
         return false;
-      
+
     };
 
 };
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
index 7bc17f920..ed68150a0 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
@@ -111,8 +111,7 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
          }
      }
 
-      template< typename CommunicatorType,
-                typename MeshFunctionType,
+      template< typename MeshFunctionType,
                 typename PeriodicBoundariesMaskPointer = Pointers::SharedPointer< MeshFunctionType > >
       void synchronize( MeshFunctionType &meshFunction,
                         bool periodicBoundaries = false,
@@ -144,9 +143,8 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
             PeriodicBoundariesMaskPointer( nullptr ) ); // the mask is used only when receiving data );
 
          //async send and receive
-         typename CommunicatorType::Request requests[2*this->getNeighborCount()];
-         typename CommunicatorType::CommunicationGroup group;
-         group=*((typename CommunicatorType::CommunicationGroup *)(distributedGrid->getCommunicationGroup()));
+         MPI_Request requests[2*this->getNeighborCount()];
+         MPI_Comm group = distributedGrid->getCommunicationGroup();
          int requestsCount( 0 );
 
          //send everything, recieve everything
@@ -158,22 +156,22 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
             if( neighbors[ i ] != -1 )
             {
                //TNL_MPI_PRINT( "Sending data to node " << neighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
+               requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
                //TNL_MPI_PRINT( "Receiving data from node " << neighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
+               requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
             }
             else if( periodicBoundaries && sendSizes[ i ] !=0 )
       	   {
                //TNL_MPI_PRINT( "Sending data to node " << periodicNeighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
+               requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
                //TNL_MPI_PRINT( "Receiving data to node " << periodicNeighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
+               requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
             }
          }
 
         //wait until send is done
         //TNL_MPI_PRINT( "Waiting for data ..." )
-        CommunicatorType::WaitAll( requests, requestsCount );
+        MPI::Waitall( requests, requestsCount );
 
         //copy data from receive buffers
         //TNL_MPI_PRINT( "Copying data ..." )
diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
index 851ff6627..b479544f7 100644
--- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
+++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
@@ -16,23 +16,21 @@
 namespace TNL {
    namespace Meshes {
       namespace DistributedMeshes {
-      
-template< typename Mesh,
-          typename Communicator >
+
+template< typename Mesh >
 class SubdomainOverlapsGetter
 {};
 
-// TODO: Specializations by the grid dimension can be avoided when the MPI directions are 
+// TODO: Specializations by the grid dimension can be avoided when the MPI directions are
 // rewritten in a dimension independent way
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 1;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -40,10 +38,9 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1,
        */
@@ -53,18 +50,17 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 2;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -72,10 +68,9 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * lower.y() is overlap of the subdomain at boundary where y = 0,
        * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1,
@@ -87,17 +82,16 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 3;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -105,10 +99,9 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * lower.y() is overlap of the subdomain at boundary where y = 0,
        * lower.z() is overlap of the subdomain at boundary where z = 0,
@@ -122,7 +115,7 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 
diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
index 9dbb1372b..aa185e1ec 100644
--- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Assert.h>
 #include <TNL/Meshes/Grid.h>
 
@@ -19,26 +20,25 @@ namespace TNL {
 
 /*
  * TODO: This could work when the MPI directions are rewritten
-         
+
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
              IndexType subdomainOverlapSize,
              const SubdomainOverlapsType& periodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( ! MPI::isDistributed() )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    for( int i = 0; i < Dimension; i++ )
    {
       CoordinatesType neighborDirection( 0 );
@@ -47,7 +47,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
          lower[ i ] = subdomainOverlapSize;
       else if( distributedMesh->getPeriodicNeighbors()[ Directions::getDirection( neighborDirection ) ] != rank )
          lower[ i ] = periodicBoundariesOverlapSize[ i ];
-      
+
       neighborDirection[ i ] = 1;
       if( subdomainCoordinates[ i ] < distributedMesh->getDomainDecomposition()[ i ] - 1 )
          upper[ i ] = subdomainOverlapSize;
@@ -55,15 +55,14 @@ getOverlaps( const DistributedMeshType* distributedMesh,
          upper[ i ] = periodicBoundariesOverlapSize[ i ];
    }
 }
- 
+
 */
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -71,13 +70,13 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -92,10 +91,9 @@ getOverlaps( const DistributedMeshType* distributedMesh,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -103,15 +101,15 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   int rank = MPI::GetRank();
    lower = 0;
    upper = 0;
-   
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -121,7 +119,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank )
       upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ];
-   
+
    if( subdomainCoordinates[ 1 ] > 0 )
       lower[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank )
@@ -135,10 +133,9 @@ getOverlaps( const DistributedMeshType* distributedMesh,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -146,13 +143,13 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -162,7 +159,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank )
       upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ];
-   
+
    if( subdomainCoordinates[ 1 ] > 0 )
       lower[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank )
@@ -172,7 +169,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYpXz ] != rank )
       upper[ 1 ] = upperPeriodicBoundariesOverlapSize[ 1 ];
-   
+
    if( subdomainCoordinates[ 2 ] > 0 )
       lower[ 2 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZmYzXz ] != rank )
diff --git a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
index 135e3c15a..52c0b543b 100644
--- a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
+++ b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
@@ -94,8 +94,7 @@ resolveAndLoadDistributedMesh( Functor&& functor,
    return resolveDistributedMeshType< ConfigTag, Device >( wrapper, fileName, fileFormat );
 }
 
-template< typename CommunicatorType,
-          typename MeshConfig,
+template< typename MeshConfig,
           typename Device >
 bool
 loadDistributedMesh( Mesh< MeshConfig, Device >& mesh,
@@ -145,8 +144,7 @@ decomposeMesh( const Config::ParameterContainer& parameters,
 }
 
 // overloads for grids
-template< typename CommunicatorType,
-          int Dimension,
+template< int Dimension,
           typename Real,
           typename Device,
           typename Index >
@@ -171,7 +169,7 @@ loadDistributedMesh( Grid< Dimension, Real, Device, Index >& mesh,
    std::cout << " [ OK ] " << std::endl;
 
    typename Meshes::DistributedMeshes::DistributedMesh<Grid< Dimension, Real, Device, Index >>::SubdomainOverlapsType overlap;
-   distributedMesh.template setGlobalGrid< CommunicatorType >( globalGrid );
+   distributedMesh.setGlobalGrid( globalGrid );
    distributedMesh.setupGrid(mesh);
    return true;
 }
@@ -191,7 +189,6 @@ decomposeMesh( const Config::ParameterContainer& parameters,
    using GridType = Grid< Dimension, Real, Device, Index >;
    using DistributedGridType = DistributedMeshes::DistributedMesh< GridType >;
    using SubdomainOverlapsType = typename DistributedGridType::SubdomainOverlapsType;
-   using CommunicatorType = typename Problem::CommunicatorType;
 
    SubdomainOverlapsType lower( 0 ), upper( 0 );
    distributedMesh.setOverlaps( lower, upper );
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 1da61c51e..131697afb 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -146,7 +146,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
         if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
             Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(initialConditionFile, *uPointer );
         synchronizer.setDistributedGrid( uPointer->getMesh().getDistributedMesh() );
-        synchronizer.template synchronize<CommunicatorType>( *uPointer );
+        synchronizer.synchronize( *uPointer );
     }
     else
     {
@@ -173,7 +173,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-   template< typename MatrixPointer >          
+   template< typename MatrixPointer >
 bool
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 setupLinearSystem( MatrixPointer& matrixPointer )
@@ -247,7 +247,7 @@ getExplicitUpdate( const RealType& time,
     *
     * You may use supporting vectors again if you need.
     */
-   
+
    this->bindDofs( uDofs );
    this->fuPointer->bind( this->getMesh(), *fuDofs );
    this->explicitUpdater.template update< typename Mesh::Cell, Communicator >( time, tau, this->getMesh(), this->uPointer, this->fuPointer );
@@ -258,7 +258,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-void 
+void
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 applyBoundaryConditions( const RealType& time,
                          DofVectorPointer& uDofs )
@@ -272,7 +272,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-    template< typename MatrixPointer > 
+    template< typename MatrixPointer >
 void
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 assemblyLinearSystem( const RealType& time,
@@ -282,7 +282,7 @@ assemblyLinearSystem( const RealType& time,
                       DofVectorPointer& bPointer )
 {
    this->bindDofs( dofsPointer );
-   this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( 
+   this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >(
       time,
       tau,
       this->getMesh(),
diff --git a/src/TNL/Problems/PDEProblem_impl.h b/src/TNL/Problems/PDEProblem_impl.h
index 6a3aa63e6..f42f18b16 100644
--- a/src/TNL/Problems/PDEProblem_impl.h
+++ b/src/TNL/Problems/PDEProblem_impl.h
@@ -59,7 +59,7 @@ template< typename Mesh,
 typename PDEProblem< Mesh, Communicator, Real, Device, Index >::IndexType
 PDEProblem< Mesh, Communicator, Real, Device, Index >::
 subdomainOverlapSize()
-{ 
+{
    return 1;
 }
 
@@ -77,9 +77,9 @@ getSubdomainOverlaps( const Config::ParameterContainer& parameters,
                       SubdomainOverlapsType& upper )
 {
    using namespace Meshes::DistributedMeshes;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() );
 }
-      
+
 template< typename Mesh,
           typename Communicator,
           typename Real,
diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
index 0c605fb95..34f2798f8 100644
--- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
@@ -63,7 +63,7 @@ setup( const Config::ParameterContainer& parameters,
    const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" );
    this->distributedMesh.setup( parameters, prefix );
    if( Problem::CommunicatorType::isDistributed() ) {
-      if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
+      if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
          return false;
       if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) )
          return false;
diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
index 5292e7f41..880d0ab31 100644
--- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
@@ -54,7 +54,7 @@ setup( const Config::ParameterContainer& parameters,
    const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" );
    this->distributedMesh.setup( parameters, prefix );
    if( Problem::CommunicatorType::isDistributed() ) {
-      if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
+      if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
          return false;
       if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) )
          return false;
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index fa1d23951..dbecdaad9 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/String.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Devices/Host.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Solvers/BuildConfigTags.h>
 #include <TNL/Solvers/ODE/Merson.h>
@@ -24,14 +24,14 @@
 #include <TNL/Solvers/PDE/PDESolverTypeResolver.h>
 
 namespace TNL {
-namespace Solvers {   
+namespace Solvers {
 
 template< typename Problem,
           typename ConfigTag,
           bool TimeDependent = Problem::isTimeDependent() >
 class TimeDependencyResolver
 {};
-   
+
 template< typename Problem,
           typename ConfigTag,
           typename TimeStepper = typename Problem::TimeStepper >
@@ -65,7 +65,7 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame
     */
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! Communicators::MpiCommunicator::setup( parameters ) 
+       ! MPI::setup( parameters )
     )
       return false;
    Problem problem;
@@ -93,7 +93,7 @@ class TimeDependencyResolver< Problem, ConfigTag, false >
                        const Config::ParameterContainer& parameters )
       {
          // TODO: This should be improved - at least rename to LinearSolverSetter
-         return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters );   
+         return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters );
       }
 };
 
@@ -336,7 +336,7 @@ bool SolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
     */
    this->computeTimer.reset();
    this->ioTimer.reset();
-   
+
    /****
     * Create solver monitor thread
     */
diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h
index 5c35c7c33..bc1f43c77 100644
--- a/src/TNL/Solvers/Solver_impl.h
+++ b/src/TNL/Solvers/Solver_impl.h
@@ -15,8 +15,8 @@
 #include <TNL/Solvers/SolverConfig.h>
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 namespace TNL {
 namespace Solvers {
@@ -35,7 +35,7 @@ run( int argc, char* argv[] )
    configDescription.addDelimiter( "Parallelization setup:" );
    Devices::Host::configSetup( configDescription );
    Devices::Cuda::configSetup( configDescription );
-   Communicators::MpiCommunicator::configSetup( configDescription );
+   MPI::configSetup( configDescription );
 
    TNL::MPI::ScopedInitializer mpi( argc, argv );
 
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index 73765aafb..a1b3a8ff3 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -15,8 +15,8 @@
 #include <TNL/Functions/TestFunction.h>
 #include <TNL/Meshes/Grid.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 
 using namespace TNL;
@@ -53,7 +53,7 @@ int main( int argc, char* argv[] )
    Config::ConfigDescription configDescription;
 
    setupConfig( configDescription );
-   Communicators::MpiCommunicator::configSetup( configDescription );
+   TNL::MPI::configSetup( configDescription );
 
    TNL::MPI::ScopedInitializer mpi(argc, argv);
 
diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h
index 8a4024ac6..e78db1153 100644
--- a/src/Tools/tnl-init.h
+++ b/src/Tools/tnl-init.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Functions/TestFunction.h>
@@ -21,8 +22,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-
 using namespace TNL;
 
 template< typename MeshType,
@@ -32,25 +31,23 @@ template< typename MeshType,
           int zDiff >
 bool renderFunction( const Config::ParameterContainer& parameters )
 {
-   using CommunicatorType = Communicators::MpiCommunicator;
-
    using namespace  Meshes::DistributedMeshes;
    using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
    DistributedGridType distributedMesh;
    Pointers::SharedPointer< MeshType > meshPointer;
    MeshType globalMesh;
 
-   if(CommunicatorType::isDistributed())
+   if(TNL::MPI::GetSize() > 1)
    {
        //suppose global mesh loaded from single file
        String meshFile = parameters.getParameter< String >( "mesh" );
        std::cout << "+ -> Loading mesh from " << meshFile << " ... " << std::endl;
        globalMesh.load( meshFile );
-   
+
        // TODO: This should work with no overlaps
-       distributedMesh.template setGlobalGrid<CommunicatorType>(globalMesh);
+       distributedMesh.setGlobalGrid(globalMesh);
        typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-       SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 );
+       SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 );
        distributedMesh.setOverlaps( lowerOverlap, upperOverlap );
        distributedMesh.setupGrid(*meshPointer);
     }
@@ -73,7 +70,7 @@ bool renderFunction( const Config::ParameterContainer& parameters )
    MeshFunctionPointer meshFunction( meshPointer );
    //if( ! discreteFunction.setSize( mesh.template getEntitiesCount< typename MeshType::Cell >() ) )
    //   return false;
- 
+
    double finalTime = parameters.getParameter< double >( "final-time" );
    double initialTime = parameters.getParameter< double >( "initial-time" );
    double tau = parameters.getParameter< double >( "snapshot-period" );
@@ -115,7 +112,7 @@ bool renderFunction( const Config::ParameterContainer& parameters )
       else
         std::cout << "+ -> Writing the function to " << outputFile << " ... " << std::endl;
 
-      if(CommunicatorType::isDistributed())
+      if(TNL::MPI::GetSize() > 1)
       {
          if( ! Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType> ::save(outputFile, *meshFunction ) )
             return false;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
index 54071032c..dccd68f23 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
@@ -1,9 +1,8 @@
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI  
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
@@ -12,30 +11,25 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
-typedef MpiCommunicator CommunicatorType;
-
-template<
-        typename MeshType,
-        typename CommunicatorType>
+template< typename MeshType >
 void SetUpDistributedGrid(DistributedMesh<MeshType> &distributedGrid, MeshType &globalGrid,int size,typename MeshType::CoordinatesType distribution )
 {
     typename MeshType::PointType globalOrigin;
     typename MeshType::PointType globalProportions;
     using DistributedMeshType = DistributedMesh< MeshType >;
-    
+
     globalOrigin.setValue( -0.5 );
     globalProportions.setValue( size );
 
     globalGrid.setDimensions( size );
     globalGrid.setDomain( globalOrigin,globalProportions );
-    
+
     distributedGrid.setDomainDecomposition( distribution );
-    distributedGrid.template setGlobalGrid<CommunicatorType>(globalGrid);
+    distributedGrid.setGlobalGrid(globalGrid);
     typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-    SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+    SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
     distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 }
 
@@ -44,47 +38,47 @@ void SetUpDistributedGrid(DistributedMesh<MeshType> &distributedGrid, MeshType &
 TEST(CutDistributedGirdTest_2D, IsInCut)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
             StaticVector<1,int>(5)
             );
 
-    if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%3==1)
+    if(TNL::MPI::GetRank()%3==1)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_2D, GloblaGridDimesion)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -92,24 +86,24 @@ TEST(CutDistributedGirdTest_2D, GloblaGridDimesion)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
 
 TEST(CutDistributedGirdTest_2D, IsDistributed)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -123,17 +117,17 @@ TEST(CutDistributedGirdTest_2D, IsDistributed)
 TEST(CutDistributedGirdTest_2D, IsNotDistributed)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(12,1));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(12,1));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -149,47 +143,47 @@ TEST(CutDistributedGirdTest_2D, IsNotDistributed)
 TEST(CutDistributedGirdTest_3D, IsInCut_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
             StaticVector<2,int>(2,2)
             );
 
-    if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%4==0)
+    if(TNL::MPI::GetRank()%4==0)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -197,24 +191,24 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
 
 TEST(CutDistributedGirdTest_3D, IsDistributed_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -228,17 +222,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_1D)
 TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -254,48 +248,48 @@ TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D)
 TEST(CutDistributedGirdTest_3D, IsInCut_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
             StaticVector<1,int>(5)
             );
 
-    int rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
+    int rank=TNL::MPI::GetRank();
     if(rank>3 && rank<8)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
@@ -303,7 +297,7 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),2) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().y(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
@@ -311,17 +305,17 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
 TEST(CutDistributedGirdTest_3D, IsDistributed_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
@@ -335,17 +329,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_2D)
 TEST(CutDistributedGirdTest_3D, IsNotDistributed_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
index 4d5bb4baf..9ad46b412 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
@@ -6,7 +6,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Functions/CutMeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -18,9 +17,6 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
-
-typedef MpiCommunicator CommunicatorType;
 
 static const char* TEST_FILE_NAME = "test_CutDistributedMeshFunctionTest.tnl";
 
@@ -52,9 +48,9 @@ TEST(CutDistributedMeshFunction, 2D_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
 
@@ -73,14 +69,14 @@ TEST(CutDistributedMeshFunction, 2D_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -134,9 +130,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -154,14 +150,14 @@ TEST(CutDistributedMeshFunction, 3D_1_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(2),
             StaticVector<2,int>(1,0),
@@ -215,9 +211,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -235,14 +231,14 @@ TEST(CutDistributedMeshFunction, 3D_2_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -302,9 +298,9 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -325,7 +321,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -338,7 +334,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -387,9 +383,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>( globalOriginalGrid );
+   distributedGrid.setGlobalGrid( globalOriginalGrid );
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -410,7 +406,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<2,int>(0,2),
@@ -423,7 +419,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -476,9 +472,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
    overlap.setValue(1);
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -499,7 +495,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -512,7 +508,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -563,9 +559,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
    overlap.setValue(1);
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>( globalOriginalGrid );
+   distributedGrid.setGlobalGrid( globalOriginalGrid );
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -586,7 +582,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -600,9 +596,8 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
         DistributedGridIO<MeshFunctionView<CutMeshType>,MpiIO> ::save(TEST_FILE_NAME, cutMeshFunction );
 
         //save globalgrid for debug render
-        typename CommunicatorType::CommunicationGroup *group;
-        group=(typename CommunicatorType::CommunicationGroup *)(cutDistributedGrid.getCommunicationGroup());
-        if(CommunicatorType::GetRank(*group)==0)
+        MPI_Comm group=cutDistributedGrid.getCommunicationGroup();
+        if(TNL::MPI::GetRank(group)==0)
         {
             File meshFile;
             meshFile.open( TEST_FILE_NAME+String("-mesh.tnl"),std::ios_base::out);
@@ -612,7 +607,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
 
     }
 
-   if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+   if(TNL::MPI::GetRank()==0)
    {
        Pointers::SharedPointer<CutMeshType> globalCutGrid;
        MeshFunctionView<CutMeshType> loadMeshFunctionptr;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
index 640aa5180..6621a01dd 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
@@ -1,11 +1,10 @@
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 #include <TNL/Functions/CutMeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Meshes/Grid.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 
 #include "../../Functions/Functions.h"
 
@@ -14,7 +13,6 @@ using namespace TNL::Containers;
 using namespace TNL::Functions;
 using namespace TNL::Meshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
 TEST(CutMeshFunction, 2D)
@@ -28,12 +26,12 @@ TEST(CutMeshFunction, 2D)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,2> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -43,18 +41,18 @@ TEST(CutMeshFunction, 2D)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<MpiCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(0),
             StaticVector<1,int>(1),
             StaticVector<1,typename CutMeshType::IndexType>(5) );
@@ -62,13 +60,13 @@ TEST(CutMeshFunction, 2D)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
        typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
        typename CutMeshType::Cell outEntity(*cutGrid);
-       
+
         fromEntity.getCoordinates().x()=i;
         fromEntity.getCoordinates().y()=5;
         outEntity.getCoordinates().x()=i;
@@ -91,12 +89,12 @@ TEST(CutMeshFunction, 3D_1)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,3> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -106,18 +104,18 @@ TEST(CutMeshFunction, 3D_1)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<MpiCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<2,int>(0,2),
             StaticVector<2,typename CutMeshType::IndexType>(5,5) );
@@ -125,13 +123,13 @@ TEST(CutMeshFunction, 3D_1)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
        typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
        typename CutMeshType::Cell outEntity(*cutGrid);
-       
+
         fromEntity.getCoordinates().x()=5;
         fromEntity.getCoordinates().y()=i;
         fromEntity.getCoordinates().z()=5;
@@ -154,12 +152,12 @@ TEST(CutMeshFunction, 3D_2)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,3> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -169,18 +167,18 @@ TEST(CutMeshFunction, 3D_2)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<MpiCommunicator, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(2,1),
             StaticVector<1,int>(0),
             StaticVector<1,typename CutMeshType::IndexType>(5) );
@@ -188,7 +186,7 @@ TEST(CutMeshFunction, 3D_2)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
@@ -196,7 +194,7 @@ TEST(CutMeshFunction, 3D_2)
         {
            typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
            typename CutMeshType::Cell outEntity(*cutGrid);
-           
+
             fromEntity.getCoordinates().x()=5;
             fromEntity.getCoordinates().y()=j;
             fromEntity.getCoordinates().z()=i;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
index 6b7c489af..11a85b68d 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
@@ -6,7 +6,6 @@
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
@@ -18,7 +17,6 @@
 using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 
@@ -186,8 +184,6 @@ class ParameterProvider<3,Device>
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, typename Device>
 class TestDistributedGridIO
 {
@@ -227,9 +223,9 @@ class TestDistributedGridIO
         overlap.setValue(1);
         DistributedGridType distributedGrid;
         distributedGrid.setDomainDecomposition( parameters.getDistr() );
-        distributedGrid.template setGlobalGrid<CommunicatorType>( globalGrid );
+        distributedGrid.setGlobalGrid( globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         //std::cout << distributedGrid.printProcessDistr() <<std::endl;
@@ -249,8 +245,8 @@ class TestDistributedGridIO
 
 
        //create similar local mesh function and evaluate linear function on it
-        PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup));
-        PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));;
+        PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank());
+        PointType localProportions=parameters.getProportions(TNL::MPI::GetRank());
 
         Pointers::SharedPointer<MeshType>  localGridptr;
         localGridptr->setDimensions(localProportions);
@@ -313,14 +309,14 @@ class TestDistributedGridIO
         overlap.setValue(1);
         DistributedGridType distributedGrid;
         distributedGrid.setDomainDecomposition( parameters.getDistr() );
-        distributedGrid.template setGlobalGrid<CommunicatorType>( globalGrid );
+        distributedGrid.setGlobalGrid( globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         //save files from local mesh
-        PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup));
-        PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));;
+        PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank());
+        PointType localProportions=parameters.getProportions(TNL::MPI::GetRank());
 
         Pointers::SharedPointer<MeshType> localGridptr;
         localGridptr->setDimensions(localProportions);
@@ -355,7 +351,7 @@ class TestDistributedGridIO
 
         DistributedMeshSynchronizer< DistributedGridType > synchronizer;
         synchronizer.setDistributedGrid( &distributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
+        synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
 
         //Crete "distributedgrid driven" grid filed by evaluated linear function
         Pointers::SharedPointer<MeshType> gridptr;
@@ -367,7 +363,7 @@ class TestDistributedGridIO
         meshFunctionptr->bind(gridptr,dof);
 
         linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
-        synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+        synchronizer.synchronize( *meshFunctionptr );
 
         for(int i=0;i<dof.getSize();i++)
         {
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
index 5bbad8f03..00705c31f 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
@@ -14,7 +14,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 
@@ -24,13 +23,10 @@
 using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, typename Device>
 class TestDistributedGridMPIIO{
     public:
@@ -63,9 +59,9 @@ class TestDistributedGridMPIIO{
         globalGrid->setDomain(globalOrigin,globalProportions);
 
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         ///std::cout << distributedGrid.printProcessDistr() <<std::endl;
@@ -84,7 +80,7 @@ class TestDistributedGridMPIIO{
         DistributedGridIO<MeshFunctionType,MpiIO> ::save(FileName, *meshFunctionptr );
 
        //first process compare results
-       if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+       if(TNL::MPI::GetRank()==0)
        {
             DofType globalEvaluatedDof(globalGrid->template getEntitiesCount< Cell >());
 
@@ -131,15 +127,15 @@ class TestDistributedGridMPIIO{
         CoordinatesType overlap;
         overlap.setValue(1);
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         String FileName=String("test-file-mpiio-load.tnl");
 
         //Prepare file
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+        if(TNL::MPI::GetRank()==0)
         {
             DofType saveDof(globalGrid->template getEntitiesCount< Cell >());
 
@@ -165,7 +161,7 @@ class TestDistributedGridMPIIO{
 
         DistributedMeshSynchronizer< DistributedGridType > synchronizer;
         synchronizer.setDistributedGrid( &distributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
+        synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
 
         Pointers::SharedPointer<MeshType> evalGridPtr;
         Pointers::SharedPointer<MeshFunctionType> evalMeshFunctionptr;
@@ -176,14 +172,14 @@ class TestDistributedGridMPIIO{
         evalMeshFunctionptr->bind(evalGridPtr,evalDof);
 
         linearFunctionEvaluator.evaluateAllEntities(evalMeshFunctionptr , linearFunctionPtr);
-        synchronizer.template synchronize<CommunicatorType>( *evalMeshFunctionptr );
+        synchronizer.synchronize( *evalMeshFunctionptr );
 
         for(int i=0;i<evalDof.getSize();i++)
         {
             EXPECT_EQ( evalDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
 
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+        if(TNL::MPI::GetRank()==0)
         {
             EXPECT_EQ( std::remove( FileName.getString()) , 0 );
         }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
index 7cb44ef22..9a3952bc3 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
@@ -7,12 +7,11 @@
  ***************************************************************************/
 
 
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -26,7 +25,6 @@ using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
 template<typename DofType>
@@ -44,13 +42,13 @@ void check_Boundary_1D(int rank, int nproc, const DofType& dof, typename DofType
         EXPECT_EQ( dof[0], expectedValue) << "Left boundary test failed";
         return;
     }
-    
+
     if(rank==(nproc-1))//Right
     {
         EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Right boundary test failed";
         return;
     }
-    
+
 };
 
 template<typename DofType>
@@ -61,15 +59,15 @@ void check_Overlap_1D(int rank, int nproc, const DofType& dof, typename DofType:
         EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Left boundary node overlap test failed";
         return;
     }
-    
+
     if( rank == ( nproc - 1 ) )
     {
         EXPECT_EQ( dof[0], expectedValue) << "Right boundary node overlap test failed";
         return;
     }
-    
+
     EXPECT_EQ( dof[0], expectedValue) << "left overlap test failed";
-    EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed";    
+    EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed";
 };
 
 template<typename DofType>
@@ -80,25 +78,24 @@ void check_Inner_1D(int rank, int nproc, const DofType& dof, typename DofType::R
 };
 
 /*
- * Light check of 1D distributed grid and its synchronization. 
+ * Light check of 1D distributed grid and its synchronization.
  * Number of process is not limited.
  * Overlap is limited to 1
  * Only double is tested as dof Real type -- it may be changed, extend test
  * Global size is hardcoded as 10 -- it can be changed, extend test
  */
 
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<1,double,Host,int> GridType;
 typedef MeshFunctionView< GridType > MeshFunctionType;
 typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType;
 typedef Vector< double,Host,int> DofType;
 typedef Vector< bool, Host, int > MaskDofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
-     
+
 class DistributedGridTest_1D : public ::testing::Test
 {
    protected:
@@ -123,14 +120,14 @@ class DistributedGridTest_1D : public ::testing::Test
       void SetUp()
       {
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
          GridType globalGrid;
 
-         globalOrigin.x()=-0.5;    
+         globalOrigin.x()=-0.5;
          globalProportions.x()=size;
 
 
@@ -142,9 +139,9 @@ class DistributedGridTest_1D : public ::testing::Test
          distributedGrid=new DistributedGridType();
 
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
-         //distributedGrid->setupGrid(*gridptr);    
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         distributedGrid->setGlobalGrid( globalGrid );
+         //distributedGrid->setupGrid(*gridptr);
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
 
@@ -155,14 +152,14 @@ class DistributedGridTest_1D : public ::testing::Test
 
          constFunctionPtr->Number=rank;
       }
-      
+
       void SetUpPeriodicBoundaries()
       {
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
-         distributedGrid->setupGrid(*gridptr);         
+         distributedGrid->setupGrid(*gridptr);
       }
 
       void TearDown()
@@ -209,7 +206,7 @@ TEST_F(DistributedGridTest_1D, evaluateInteriorEntities)
    check_Boundary_1D(rank, nproc, dof, -1);
    check_Overlap_1D(rank, nproc, dof, -1);
    check_Inner_1D(rank, nproc, dof, rank);
-}    
+}
 
 TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
 {
@@ -217,7 +214,7 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
+   synchronizer.synchronize( *meshFunctionPtr );
 
    if(rank!=0) {
       EXPECT_EQ((dof)[0],rank-1)<< "Left Overlap was filled by wrong process.";
@@ -229,12 +226,12 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
 
 TEST_F(DistributedGridTest_1D, EvaluateLinearFunction )
 {
-   //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) 
+   //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid)
    setDof_1D(dof,-1);
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr);
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
+   synchronizer.synchronize( *meshFunctionPtr );
 
    auto entity = gridptr->template getEntity< Cell >(0);
    entity.refresh();
@@ -250,7 +247,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -258,13 +255,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask )
    maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridptr, dof );
    maskPointer->bind( gridptr, maskDofs );
-   
+
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true );
+   synchronizer.synchronize( *meshFunctionPtr, true );
 
    if( rank == 0 ) {
       EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process.";
@@ -279,7 +276,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -287,14 +284,14 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask )
    maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridptr, dof );
    maskPointer->bind( gridptr, maskDofs );
-   
+
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr, constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true, maskPointer );
+   synchronizer.synchronize( *meshFunctionPtr, true, maskPointer );
    if( rank == 0 ) {
       EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process.";
    }
@@ -310,7 +307,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -325,9 +322,9 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    TNL_MPI_PRINT( "#### " << dof );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
+   meshFunctionPtr->synchronize( true, maskPointer );
    TNL_MPI_PRINT( ">>> " << dof );
-   
+
    if( rank == 0 )
       EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process.";
    if( rank == nproc-1 )
@@ -339,7 +336,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -350,27 +347,27 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask )
 
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
-   maskDofs.setElement( 1, false );   
+   maskDofs.setElement( 1, false );
    maskDofs.setElement( dof.getSize() - 2, false );
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
       EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process.";
    if( rank == nproc-1 )
-      EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process.";   
-   
+      EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process.";
+
 }
 */
 
 TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -382,13 +379,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest )
 
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true );
+   synchronizer.synchronize( *meshFunctionPtr, true );
 
    auto entity = gridptr->template getEntity< Cell >( 0 );
    auto entity2= gridptr->template getEntity< Cell >( (dof).getSize() - 1 );
    entity.refresh();
    entity2.refresh();
-   
+
    if( rank == 0 ) {
       EXPECT_EQ( meshFunctionPtr->getValue(entity), 9 ) << "Linear function Overlap error on left Edge.";
    }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
index 71370cae2..1f02dd236 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
@@ -7,14 +7,13 @@
  ***************************************************************************/
 
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 
@@ -25,10 +24,9 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
- 
+
 
 template<typename DofType>
 void setDof_2D( DofType &dof, typename DofType::RealType value )
@@ -46,7 +44,7 @@ void checkLeftEdge( const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxy;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin;i<end;i++ )
             EXPECT_EQ( dof[maxx*i], expectedValue) << "Left Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -60,8 +58,8 @@ void checkRightEdge(const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxy;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
-    for( int i = begin; i < end; i++ ) 
+
+    for( int i = begin; i < end; i++ )
             EXPECT_EQ( dof[maxx*i+(maxx-1)], expectedValue) << "Right Edge test failed " << i <<" " << maxx << " "<< maxy;
 }
 
@@ -74,7 +72,7 @@ void checkUpEdge( const GridType &grid, const DofType &dof, bool with_first, boo
     int end = maxx;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin; i<end; i++ )
             EXPECT_EQ( dof[i], expectedValue) << "Up Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -88,7 +86,7 @@ void checkDownEdge( const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxx;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin; i<end; i++ )
             EXPECT_EQ( dof[maxx*(maxy-1)+i], expectedValue) << "Down Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -102,7 +100,7 @@ void checkLeftBoundary( const GridType &grid, const DofType &dof, bool with_firs
    int end = maxy - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
+
    for( int i=begin;i<end;i++ )
       EXPECT_EQ( dof[ maxx * i + 1 ], expectedValue) << "Left Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -116,8 +114,8 @@ void checkRightBoundary(const GridType &grid, const DofType &dof, bool with_firs
    int end = maxy - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
-   for( int i = begin; i < end; i++ ) 
+
+   for( int i = begin; i < end; i++ )
      EXPECT_EQ( dof[ maxx * i + ( maxx - 2 ) ], expectedValue) << "Right Edge test failed " << i <<" " << maxx << " "<< maxy;
 }
 
@@ -130,7 +128,7 @@ void checkUpBoundary( const GridType &grid, const DofType &dof, bool with_first,
    int end = maxx - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
+
    for( int i=begin; i<end; i++ )
       EXPECT_EQ( dof[ maxx + i ], expectedValue) << "Up Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -144,7 +142,7 @@ void checkDownBoundary( const GridType &grid, const DofType &dof, bool with_firs
    int end = maxx - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-   
+
    for( int i=begin; i<end; i++ )
       EXPECT_EQ( dof[ maxx * ( maxy-2 ) + i ], expectedValue) << "Down Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -176,51 +174,51 @@ void checkCorner(const GridType &grid, const DofType &dof, bool up, bool left, t
 /*expecting 9 processes*/
 template<typename DofType,typename GridType>
 void check_Boundary_2D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue)
-{    
+{
 
     if(rank==0)//Up Left
     {
         checkUpEdge(grid,dof,true,false,expectedValue);//posledni je overlap
         checkLeftEdge(grid,dof,true,false, expectedValue);//posledni je overlap
     }
-    
+
     if(rank==1)//Up Center
     {
         checkUpEdge(grid,dof,false,false, expectedValue);//prvni a posledni je overlap
     }
-    
+
     if(rank==2)//Up Right
     {
         checkUpEdge(grid,dof,false,true,expectedValue);//prvni je overlap
         checkRightEdge(grid,dof,true,false,expectedValue);//posledni je overlap
     }
-    
+
     if(rank==3)//Center Left
     {
         checkLeftEdge(grid,dof,false,false,expectedValue);//prvni a posledni je overlap
     }
-    
+
     if(rank==4)//Center Center
     {
         //No boundary
     }
-    
+
     if(rank==5)//Center Right
     {
         checkRightEdge(grid,dof,false,false,expectedValue);
     }
-    
+
     if(rank==6)//Down Left
     {
         checkDownEdge(grid,dof,true,false,expectedValue);
         checkLeftEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==7) //Down Center
     {
         checkDownEdge(grid,dof,false,false,expectedValue);
     }
-    
+
     if(rank==8) //Down Right
     {
             checkDownEdge(grid,dof,false,true,expectedValue);
@@ -241,27 +239,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena
         checkRightEdge(grid,dof,false,true,expectedValue);
         checkDownEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==1)//Up Center
     {
         checkDownEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==2)//Up Right
     {
         checkDownEdge(grid,dof,true,false,expectedValue);//prvni je overlap
         checkLeftEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==3)//Center Left
     {
         checkUpEdge(grid,dof,false,true,expectedValue);
         checkDownEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==4)//Center Center
     {
         checkUpEdge(grid,dof,true,true,expectedValue);
@@ -269,27 +267,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena
         checkRightEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==5)//Center Right
     {
         checkUpEdge(grid,dof,true,false,expectedValue);
         checkDownEdge(grid,dof,true,false,expectedValue);
         checkLeftEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==6)//Down Left
     {
         checkUpEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,true,false,expectedValue);
     }
-    
+
     if(rank==7) //Down Center
     {
         checkUpEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,true,false,expectedValue);
         checkRightEdge(grid,dof,true,false,expectedValue);
     }
-    
+
     if(rank==8) //Down Right
     {
         checkUpEdge(grid,dof,true,false,expectedValue);
@@ -310,26 +308,25 @@ void check_Inner_2D(int rank, const GridType& grid, const DofType& dof, typename
 }
 
 /*
- * Light check of 2D distributed grid and its synchronization. 
+ * Light check of 2D distributed grid and its synchronization.
  * expected 9 processes
  */
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<2,double,Host,int> GridType;
 typedef MeshFunctionView<GridType> MeshFunctionType;
 typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType;
 typedef Vector<double,Host,int> DofType;
 typedef Vector< bool, Host, int > MaskDofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
 
 class DistributedGridTest_2D : public ::testing::Test
 {
-    
+
    public:
-      
+
       using CoordinatesType = typename GridType::CoordinatesType;
 
       DistributedGridType *distributedGrid;
@@ -347,20 +344,20 @@ class DistributedGridTest_2D : public ::testing::Test
       Pointers::SharedPointer< LinearFunction<double,2>, Host > linearFunctionPtr;
 
       int rank;
-      int nproc;    
+      int nproc;
 
       void SetUp()
       {
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
          GridType globalGrid;
 
          globalOrigin.x()=-0.5;
-         globalOrigin.y()=-0.5;    
+         globalOrigin.y()=-0.5;
          globalProportions.x()=size;
          globalProportions.y()=size;
 
@@ -369,9 +366,9 @@ class DistributedGridTest_2D : public ::testing::Test
 
          distributedGrid=new DistributedGridType();
          distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3 ) );
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
+         distributedGrid->setGlobalGrid( globalGrid );
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
          distributedGrid->setupGrid(*gridPtr);
@@ -422,17 +419,17 @@ TEST_F(DistributedGridTest_2D, evaluateInteriorEntities)
     check_Boundary_2D(rank, *gridPtr, *dof, -1);
     check_Overlap_2D(rank, *gridPtr, *dof, -1);
     check_Inner_2D(rank, *gridPtr, *dof, rank);
-}    
+}
 
 TEST_F(DistributedGridTest_2D, LinearFunctionTest)
 {
-    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) 
+    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid)
     setDof_2D(*dof,-1);
     linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr);
     Synchronizer synchronizer;
     synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
-    
+    synchronizer.synchronize( *meshFunctionPtr );
+
     int count =gridPtr->template getEntitiesCount< Cell >();
     for(int i=0;i<count;i++)
     {
@@ -449,17 +446,17 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
-    
+   synchronizer.synchronize( *meshFunctionPtr );
+
    // checkNeighbor_2D(rank, *gridPtr, *dof);
-   
+
     if(rank==0)//Up Left
     {
         checkRightEdge(*gridPtr, *dof, true,  false, 1 );
         checkDownEdge( *gridPtr, *dof, true,  false, 3 );
         checkCorner(   *gridPtr, *dof, false, false, 4 );
     }
-    
+
     if(rank==1)//Up Center
     {
         checkLeftEdge( *gridPtr, *dof, true,  false, 0 );
@@ -468,14 +465,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge( *gridPtr, *dof, false, false, 4 );
         checkCorner(   *gridPtr, *dof, false, false, 5 );
     }
-    
+
     if(rank==2)//Up Right
     {
         checkLeftEdge( *gridPtr, *dof, true,  false, 1 );
         checkCorner(   *gridPtr, *dof, false, true,  4 );
         checkDownEdge( *gridPtr, *dof, false, true,  5 );
     }
-    
+
     if(rank==3)//Center Left
     {
         checkUpEdge(    *gridPtr, *dof, true,  false, 0 );
@@ -484,7 +481,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge(  *gridPtr, *dof, true,  false, 6 );
         checkCorner(    *gridPtr, *dof, false, false, 7 );
     }
-    
+
     if(rank==4)//Center Center
     {
         checkCorner(    *gridPtr, *dof, true,  true,  0 );
@@ -496,7 +493,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge(  *gridPtr, *dof, false, false, 7 );
         checkCorner(    *gridPtr, *dof, false, false, 8 );
     }
-    
+
     if(rank==5)//Center Right
     {
         checkCorner(   *gridPtr, *dof, true,  true,  1 );
@@ -505,14 +502,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkCorner(   *gridPtr, *dof, false, true,  7 );
         checkDownEdge( *gridPtr, *dof, false, true,  8 );
     }
-    
+
     if(rank==6)//Down Left
     {
         checkUpEdge(    *gridPtr, *dof, true,  false, 3 );
         checkCorner(    *gridPtr, *dof, true,  false, 4 );
         checkRightEdge( *gridPtr, *dof, false, true,  7 );
     }
-    
+
     if(rank==7) //Down Center
     {
         checkCorner(    *gridPtr, *dof, true,  true,  3 );
@@ -521,77 +518,77 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkLeftEdge(  *gridPtr, *dof, false, true,  6 );
         checkRightEdge( *gridPtr, *dof, false, true,  8 );
     }
-    
+
     if(rank==8) //Down Right
     {
         checkCorner(   *gridPtr, *dof, true,  true, 4 );
         checkUpEdge(   *gridPtr, *dof, false, true, 5 );
         checkLeftEdge( *gridPtr, *dof, false, true, 7 );
-    }   
+    }
 }
 
-// TODO: Fix tests for periodic BC - 
+// TODO: Fix tests for periodic BC -
 // checkLeftBoundary -> checkLeft Overlap etc. for direction BoundaryToOverlap
 // Fix the tests with mask to work with the direction OverlapToBoundary
 /*
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
    dof->setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true );
-   
+   meshFunctionPtr->synchronize( true );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -609,10 +606,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveMask )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -620,13 +617,13 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
+   meshFunctionPtr->synchronize( true, maskPointer );
 
    if( rank == 0 )
    {
@@ -634,39 +631,39 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -684,10 +681,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiveMaskOnLeft )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -695,7 +692,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -711,47 +708,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, 0 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, 3 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  6 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -769,10 +766,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskOnRight )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -780,7 +777,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -796,47 +793,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, 2 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, 5 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -854,10 +851,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskUp )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -865,7 +862,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -881,47 +878,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, 0 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, 1 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, 2 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -939,10 +936,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskDown )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -950,7 +947,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -966,47 +963,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, 6 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -1020,7 +1017,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
       checkRightBoundary( *gridPtr, *dof, true, false, -7 );
    }
 }
-*/ 
+*/
 #endif
 
 #endif
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
index 765341c1e..4f552dee5 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
@@ -1,9 +1,8 @@
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -16,8 +15,7 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
-using namespace TNL::Meshes::DistributedMeshes; 
+using namespace TNL::Meshes::DistributedMeshes;
 
 template<typename DofType>
 void setDof_3D(DofType &dof, typename DofType::RealType value)
@@ -49,14 +47,14 @@ void checkConner(const GridType &grid, const DofType &dof,bool bottom, bool nort
 {
     int i=getAdd(grid,bottom,north,west);
     EXPECT_EQ( dof[i], expectedValue) << "Conner test failed";
-    
+
 }
 
 template<typename DofType,typename GridType>
 void checkXDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool north, typename DofType::RealType expectedValue)
 {
-    int add=getAdd(grid,bottom,north,true);        
-    for(int i=1;i<grid.getDimensions().x()-1;i++) 
+    int add=getAdd(grid,bottom,north,true);
+    for(int i=1;i<grid.getDimensions().x()-1;i++)
             EXPECT_EQ( dof[i+add], expectedValue) << "X direction Edge test failed " << i;
 }
 
@@ -65,7 +63,7 @@ template<typename DofType,typename GridType>
 void checkYDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool west, typename DofType::RealType expectedValue)
 {
     int add=getAdd(grid,bottom,true,west);
-    for(int i=1;i<grid.getDimensions().y()-1;i++) 
+    for(int i=1;i<grid.getDimensions().y()-1;i++)
             EXPECT_EQ( dof[grid.getDimensions().x()*i+add], expectedValue) << "Y direction Edge test failed " << i;
 }
 
@@ -73,7 +71,7 @@ template<typename DofType,typename GridType>
 void checkZDirectionEdge(const GridType &grid, const DofType &dof, bool north, bool west, typename DofType::RealType expectedValue)
 {
     int add=getAdd(grid,true,north,west);
-    for(int i=1;i<grid.getDimensions().z()-1;i++) 
+    for(int i=1;i<grid.getDimensions().z()-1;i++)
             EXPECT_EQ( dof[grid.getDimensions().y()*grid.getDimensions().x()*i+add], expectedValue) << "Z direction Edge test failed " << i;
 }
 
@@ -125,7 +123,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
         checkZFace(grid, dof, true, expectedValue);
-    }    
+    }
 
     if(rank==1)//Bottom North Center
     {
@@ -199,7 +197,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkZDirectionEdge(grid,dof,true,true,expectedValue);
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
-    }    
+    }
 
     if(rank==10)//Center North Center
     {
@@ -257,7 +255,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
         checkZFace(grid, dof, false, expectedValue);
-    }    
+    }
 
     if(rank==19)//Top North Center
     {
@@ -406,8 +404,8 @@ void CheckXFaceNode_Overlap(const GridType &grid, const DofType &dof,bool west,
    checkXFace(grid, dof, !west, expectedValue);
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
-   checkZFace(grid, dof, false, expectedValue);    
-   checkZFace(grid, dof, true, expectedValue);        
+   checkZFace(grid, dof, false, expectedValue);
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -429,7 +427,7 @@ void CheckYFaceNode_Overlap(const GridType &grid, const DofType &dof,bool north,
    checkXFace(grid, dof, true, expectedValue);
    checkYFace(grid, dof, !north, expectedValue);
    checkZFace(grid, dof, false, expectedValue);
-   checkZFace(grid, dof, true, expectedValue);    
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -451,7 +449,7 @@ void CheckZFaceNode_Overlap(const GridType &grid, const DofType &dof,bool bottom
    checkXFace(grid, dof, true, expectedValue);
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
-   checkZFace(grid, dof, !bottom, expectedValue);    
+   checkZFace(grid, dof, !bottom, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -484,11 +482,11 @@ void CheckCentralNode_Overlap(const GridType &grid, const DofType &dof,typename
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
    checkZFace(grid, dof, false, expectedValue);
-   checkZFace(grid, dof, true, expectedValue);    
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 /*
-* Expected 27 processes. 
+* Expected 27 processes.
 */
 template<typename DofType,typename GridType>
 void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue)
@@ -499,7 +497,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena
    if(rank==1)
        CheckXEdgeNode_Overlap(grid,dof,true,true,expectedValue);
 
-   if(rank==2)    
+   if(rank==2)
        CheckConnerNode_Overlap(grid,dof,true,true,false,expectedValue);
 
    if(rank==3)
@@ -553,7 +551,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena
    if(rank==19)
        CheckXEdgeNode_Overlap(grid,dof,false,true,expectedValue);
 
-   if(rank==20)    
+   if(rank==20)
        CheckConnerNode_Overlap(grid,dof,false,true,false,expectedValue);
 
    if(rank==21)
@@ -590,19 +588,18 @@ void check_Inner_3D(int rank, const GridType& grid, const DofType& dof, typename
 
 
 /*
- * Light check of 3D distributed grid and its synchronization. 
+ * Light check of 3D distributed grid and its synchronization.
  * expected 27 processes
  */
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<3,double,Host,int> GridType;
 typedef MeshFunctionView<GridType> MeshFunctionType;
 typedef Vector<double,Host,int> DofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
-     
+
 class DistributedGirdTest_3D : public ::testing::Test
 {
    protected:
@@ -620,14 +617,14 @@ class DistributedGirdTest_3D : public ::testing::Test
       Pointers::SharedPointer< LinearFunction<double,3>, Host > linearFunctionPtr;
 
       int rank;
-      int nproc;    
+      int nproc;
 
       void SetUp()
       {
 
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
@@ -635,7 +632,7 @@ class DistributedGirdTest_3D : public ::testing::Test
 
          globalOrigin.x()=-0.5;
          globalOrigin.y()=-0.5;
-         globalOrigin.z()=-0.5;    
+         globalOrigin.z()=-0.5;
          globalProportions.x()=size;
          globalProportions.y()=size;
          globalProportions.z()=size;
@@ -645,17 +642,17 @@ class DistributedGirdTest_3D : public ::testing::Test
 
          distributedGrid=new DistributedGridType();
          distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3, 3 ) );
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
-         distributedGrid->setupGrid(*gridptr);    
+         distributedGrid->setGlobalGrid( globalGrid );
+         distributedGrid->setupGrid(*gridptr);
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
 
          distributedGrid->setupGrid(*gridptr);
          dof=new DofType(gridptr->template getEntitiesCount< Cell >());
 
-         meshFunctionptr->bind(gridptr,*dof);   
+         meshFunctionptr->bind(gridptr,*dof);
          constFunctionPtr->Number=rank;
       }
 
@@ -697,17 +694,17 @@ TEST_F(DistributedGirdTest_3D, evaluateInteriorEntities)
     check_Boundary_3D(rank, *gridptr, *dof, -1);
     check_Overlap_3D(rank, *gridptr, *dof, -1);
     check_Inner_3D(rank, *gridptr, *dof, rank);
-}   
+}
 
 TEST_F(DistributedGirdTest_3D, LinearFunctionTest)
 {
-    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) 
+    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid)
     setDof_3D(*dof,-1);
     linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr, linearFunctionPtr);
     Synchronizer synchronizer;
     synchronizer.setDistributedGrid( meshFunctionptr->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
-    
+    synchronizer.synchronize( *meshFunctionptr );
+
     int count =gridptr->template getEntitiesCount< Cell >();
     for(int i=0;i<count;i++)
     {
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
index f35ec8e08..d6791e1df 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
@@ -1,4 +1,3 @@
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Functions/VectorField.h>
@@ -17,13 +16,10 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, int vctdim, typename Device>
 class TestDistributedVectorFieldMPIIO{
     public:
@@ -33,8 +29,8 @@ class TestDistributedVectorFieldMPIIO{
 	typedef VectorField<vctdim,MeshFunctionType> VectorFieldType;
     typedef Vector<double,Device,int> DofType;
     typedef typename MeshType::Cell Cell;
-    typedef typename MeshType::IndexType IndexType; 
-    typedef typename MeshType::PointType PointType; 
+    typedef typename MeshType::IndexType IndexType;
+    typedef typename MeshType::PointType PointType;
     typedef DistributedMesh<MeshType> DistributedGridType;
 
     typedef typename DistributedGridType::CoordinatesType CoordinatesType;
@@ -43,8 +39,8 @@ class TestDistributedVectorFieldMPIIO{
     static void TestSave()
     {
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
-        
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;
+
         //save distributed meshfunction into file
         PointType globalOrigin;
         globalOrigin.setValue(-0.5);
@@ -55,14 +51,14 @@ class TestDistributedVectorFieldMPIIO{
         Pointers::SharedPointer<MeshType> globalGrid;
         globalGrid->setDimensions(globalProportions);
         globalGrid->setDomain(globalOrigin,globalProportions);
-        
+
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
 
-        Pointers::SharedPointer<MeshType> gridptr;        
+        Pointers::SharedPointer<MeshType> gridptr;
         distributedGrid.setupGrid(*gridptr);
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
         distributedGrid.setupGrid(*gridptr);
 
@@ -74,10 +70,10 @@ class TestDistributedVectorFieldMPIIO{
         DofType dof(vctdim*(gridptr->template getEntitiesCount< Cell >()));
         dof.setValue(0);
         vectorField.bind(gridptr,dof);
-            
+
 		for(int i=0;i<vctdim;i++)
 	        linearFunctionEvaluator.evaluateAllEntities(vectorField [ i ], linearFunctionPtr);
- 
+
         String FileName=String("/tmp/test-file.tnl");
         DistributedGridIO_VectorField<VectorFieldType,MpiIO> ::save(FileName, vectorField );
         /*File file;
@@ -86,7 +82,7 @@ class TestDistributedVectorFieldMPIIO{
 		file.close();		*/
 
        //first process compare results
-       if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+       if(TNL::MPI::GetRank()==0)
        {
             DofType globalEvaluatedDof(vctdim*(globalGrid->template getEntitiesCount< Cell >()));
 
@@ -101,7 +97,7 @@ class TestDistributedVectorFieldMPIIO{
             loadvct.bind(globalGrid,loadDof);
 
             loadDof.setValue(-1);
-        
+
             File file;
             file.open( FileName, std::ios_base::in );
 	    loadvct.boundLoad(file);
@@ -111,13 +107,13 @@ class TestDistributedVectorFieldMPIIO{
 	    }
        }
     };
-    
+
     static void TestLoad()
     {
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;
 
-        //Crete distributed grid            
+        //Create distributed grid
         PointType globalOrigin;
         globalOrigin.setValue(-0.5);
 
@@ -131,26 +127,26 @@ class TestDistributedVectorFieldMPIIO{
         CoordinatesType overlap;
         overlap.setValue(1);
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>(*globalGrid);
+        distributedGrid.setGlobalGrid(*globalGrid);
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
 
-        String FileName=String("/tmp/test-file.tnl");         
+        String FileName=String("/tmp/test-file.tnl");
 
-        //Prepare file   
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
-        {   
+        //Prepare file
+        if(TNL::MPI::GetRank()==0)
+        {
             DofType saveDof(vctdim*(globalGrid->template getEntitiesCount< Cell >()));
 
             VectorFieldType saveVectorField;
             saveVectorField.bind(globalGrid,saveDof);
             for(int i=0;i<vctdim;i++)
                 linearFunctionEvaluator.evaluateAllEntities(saveVectorField[i] , linearFunctionPtr);
-      
+
             File file;
-            file.open( FileName, std::ios_base::out );        
+            file.open( FileName, std::ios_base::out );
             saveVectorField.save(file);
             file.close();
         }
@@ -158,7 +154,7 @@ class TestDistributedVectorFieldMPIIO{
         Pointers::SharedPointer<MeshType> loadGridptr;
         VectorFieldType loadVectorField;
         distributedGrid.setupGrid(*loadGridptr);
-        
+
         DofType loadDof(vctdim*(loadGridptr->template getEntitiesCount< Cell >()));
         loadDof.setValue(0);
         loadVectorField.bind(loadGridptr,loadDof);
@@ -169,26 +165,26 @@ class TestDistributedVectorFieldMPIIO{
         synchronizer.setDistributedGrid( &distributedGrid );
 
         for(int i=0;i<vctdim;i++)
-            synchronizer.template synchronize<CommunicatorType>(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof
+            synchronizer.synchronize(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof
 
         Pointers::SharedPointer<MeshType> evalGridPtr;
         VectorFieldType evalVectorField;
         distributedGrid.setupGrid(*evalGridPtr);
-        
+
         DofType evalDof(vctdim*(evalGridPtr->template getEntitiesCount< Cell >()));
         evalDof.setValue(-1);
         evalVectorField.bind(evalGridPtr,evalDof);
-        
+
         for(int i=0;i<vctdim;i++)
         {
-            linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr);        
-            synchronizer.template synchronize<CommunicatorType>(*evalVectorField[i]);
+            linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr);
+            synchronizer.synchronize(*evalVectorField[i]);
         }
 
         for(int i=0;i<evalDof.getSize();i++)
         {
             EXPECT_EQ( evalDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
-        
+
     }
 };
-- 
GitLab


From cc3b9348255ef44300f186df3b0b6b5419cb7667 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 15:09:35 +0100
Subject: [PATCH 47/50] MPI refactoring: replaced DimsCreate and CreateNewGroup
 in MpiCommunicator with plain function wrappers

---
 src/TNL/Communicators/MpiCommunicator.h | 41 --------------------
 src/TNL/MPI/DummyDefs.h                 | 12 ++++++
 src/TNL/MPI/Wrappers.h                  | 50 +++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 41 deletions(-)

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index c155cabbe..47392ca60 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -175,47 +175,6 @@ class MpiCommunicator
          MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
-
-      //dim-number of dimensions, distr array of guess distr - 0 for computation
-      //distr array will be filled by computed distribution
-      //more information in MPI documentation
-      static void DimsCreate(int nproc, int dim, int *distr)
-      {
-#ifdef HAVE_MPI
-         int sum = 0, prod = 1;
-         for( int i = 0;i < dim; i++ ) {
-            sum += distr[ i ];
-            prod *= distr[ i ];
-         }
-         if( prod != 0 && prod != GetSize( AllGroup ) )
-            throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions."
-                                    "Non of the dimensions is zero and product of all dimensions does "
-                                    "not fit with number of MPI processes." );
-         if(sum==0) {
-            for(int i=0;i<dim-1;i++)
-               distr[i]=1;
-            distr[dim-1]=0;
-         }
-
-         MPI_Dims_create(nproc, dim, distr);
-#else
-         for(int i=0;i<dim;i++)
-            distr[i]=1;
-#endif
-      }
-
-      static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup )
-      {
-#ifdef HAVE_MPI
-         if(meToo)
-            MPI_Comm_split(oldGroup, 1, myRank, &newGroup);
-         else
-            MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup);
-#else
-         newGroup=oldGroup;
-#endif
-      }
-
 #ifdef HAVE_MPI
       static MPI_Comm AllGroup;
       static MPI_Comm NullGroup;
diff --git a/src/TNL/MPI/DummyDefs.h b/src/TNL/MPI/DummyDefs.h
index cdd5ea483..578e46dfe 100644
--- a/src/TNL/MPI/DummyDefs.h
+++ b/src/TNL/MPI/DummyDefs.h
@@ -36,4 +36,16 @@ enum {
    MPI_THREAD_SERIALIZED,
    MPI_THREAD_MULTIPLE
 };
+
+// Miscellaneous constants
+#define MPI_ANY_SOURCE         -1                      /* match any source rank */
+#define MPI_PROC_NULL          -2                      /* rank of null process */
+#define MPI_ROOT               -4                      /* special value for intercomms */
+#define MPI_ANY_TAG            -1                      /* match any message tag */
+#define MPI_UNDEFINED          -32766                  /* undefined stuff */
+#define MPI_DIST_GRAPH         3                       /* dist graph topology */
+#define MPI_CART               1                       /* cartesian topology */
+#define MPI_GRAPH              2                       /* graph topology */
+#define MPI_KEYVAL_INVALID     -1                      /* invalid key value */
+
 #endif
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
index 39344a128..8a455dcb7 100644
--- a/src/TNL/MPI/Wrappers.h
+++ b/src/TNL/MPI/Wrappers.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <iostream>
+#include <stdexcept>
 
 #ifdef HAVE_MPI
    #include <mpi.h>
@@ -156,6 +157,55 @@ inline int GetSize( MPI_Comm group = AllGroup() )
 #endif
 }
 
+// wrappers for MPI helper functions
+
+inline MPI_Comm Comm_split( MPI_Comm comm, int color, int key )
+{
+#ifdef HAVE_MPI
+   MPI_Comm newcomm;
+   MPI_Comm_split( comm, color, key, &newcomm );  // the new communicator is returned by value instead of via an output argument
+   return newcomm;
+#else
+   return comm;  // no MPI support: color and key are ignored and the input communicator is passed through
+#endif
+}
+
+/**
+ * \brief Wrapper for \ref MPI_Dims_create.
+ *
+ * \param nproc - number of processes in the group to be distributed
+ * \param ndims - number of dimensions of the Cartesian grid
+ * \param dims - distribution of processes into the \e ndims-dimensional
+ *               Cartesian grid (in/out array of length \e ndims)
+ *
+ * \throws std::invalid_argument if any \e dims[i] is negative, std::logic_error
+ * if \e nproc is not a multiple of the product of all non-zero values \e dims[i]
+ * (MPI builds only). Without MPI support, every \e dims[i] is simply set to 1.
+ * See the MPI documentation for more information.
+ */
+inline void Compute_dims( int nproc, int ndims, int* dims )
+{
+#ifdef HAVE_MPI
+   int prod = 1;  // product of the entries that the caller has fixed (i.e. the non-zero ones)
+   for( int i = 0; i < ndims; i++ ) {
+      if( dims[ i ] < 0 )
+         throw std::invalid_argument( "Negative value passed to MPI::Compute_dims in the dims array argument." );
+      if( dims[ i ] > 0 )
+         prod *= dims[ i ];
+   }
+
+   if( nproc % prod != 0 )  // validate here so the caller gets an exception with a readable message
+      throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions. "
+            "The product of the non-zero values dims[i] is " + std::to_string(prod) + " and the "
+            "number of processes (" + std::to_string(nproc) + ") is not a multiple of the product." );
+
+   MPI_Dims_create( nproc, ndims, dims );
+#else
+   for( int i = 0; i < ndims; i++)
+      dims[ i ] = 1;
+#endif
+}
+
 // wrappers for MPI communication functions
 
 inline void Barrier( MPI_Comm group = AllGroup() )
-- 
GitLab


From 60ee5cd078e372949d55b769f4a0c66c0c961f24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 15:50:25 +0100
Subject: [PATCH 48/50] MPI refactoring: removed unit tests for MpiCommunicator
 and marked it as deprecated

---
 src/TNL/Communicators/MpiCommunicator.h       |  3 +-
 src/UnitTests/CMakeLists.txt                  |  1 -
 src/UnitTests/Communicators/CMakeLists.txt    |  9 ----
 .../Communicators/MpiCommunicatorTest.cpp     | 51 -------------------
 .../DistributedVectorFieldIO_MPIIOTest.cpp    |  5 --
 src/UnitTests/main_mpi.h                      |  5 +-
 6 files changed, 4 insertions(+), 70 deletions(-)
 delete mode 100644 src/UnitTests/Communicators/CMakeLists.txt
 delete mode 100644 src/UnitTests/Communicators/MpiCommunicatorTest.cpp

diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 47392ca60..cd5162968 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -21,7 +21,8 @@ namespace Communicators {
 namespace {
 
 //! \brief MPI communicator.
-class MpiCommunicator
+class [[deprecated("use the functions in the TNL::MPI namespace instead")]]
+MpiCommunicator
 {
    public:
 #ifdef HAVE_MPI
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index 8e4ac7249..2c0ba8650 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -1,4 +1,3 @@
-ADD_SUBDIRECTORY( Communicators )
 ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Functions )
 # Matrices are included from src/CMakeLists.txt
diff --git a/src/UnitTests/Communicators/CMakeLists.txt b/src/UnitTests/Communicators/CMakeLists.txt
deleted file mode 100644
index 1a3331c3a..000000000
--- a/src/UnitTests/Communicators/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-if( ${BUILD_MPI} )
-   ADD_EXECUTABLE( MpiCommunicatorTest MpiCommunicatorTest.cpp )
-   TARGET_COMPILE_OPTIONS( MpiCommunicatorTest PRIVATE ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( MpiCommunicatorTest ${GTEST_BOTH_LIBRARIES} )
-
-   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MpiCommunicatorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-   ADD_TEST( NAME MpiCommunicatorTest COMMAND "mpirun" ${mpi_test_parameters})
-
-endif()
diff --git a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp b/src/UnitTests/Communicators/MpiCommunicatorTest.cpp
deleted file mode 100644
index b78011953..000000000
--- a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/***************************************************************************
-                          MpiCommunicatorTest.h  -  description
-                             -------------------
-    begin                : Jul 10, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef HAVE_GTEST
-
-#include "gtest/gtest.h"
-#include <TNL/Communicators/MpiCommunicator.h>
-
-using namespace TNL;
-using namespace TNL::Communicators;
-
-// test fixture for typed tests
-template< typename Real >
-class MpiCommunicatorTest : public ::testing::Test
-{
-   protected:
-      using RealType = Real;
-      using CommunicatorType = MpiCommunicator;
-};
-
-// types for which MpiCommunicatorTest is instantiated
-using MpiCommunicatorTypes = ::testing::Types<
-   short,
-   int,
-   long,
-   float,
-   double
->;
-
-TYPED_TEST_SUITE( MpiCommunicatorTest, MpiCommunicatorTypes );
-
-TYPED_TEST( MpiCommunicatorTest, allReduce )
-{
-   using RealType = typename TestFixture::RealType;
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-   RealType a = CommunicatorType::GetRank();
-   RealType b = 0;
-   CommunicatorType::Allreduce( &a, &b, 1, MPI_MAX, MPI_COMM_WORLD );
-   EXPECT_EQ( b, CommunicatorType::GetSize() - 1  );
-}
-
-#endif // HAVE_GTEST
-
-#include "../main_mpi.h"
\ No newline at end of file
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
index 0a5ab3e37..9bdccbcdb 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
@@ -2,13 +2,8 @@
       #include <gtest/gtest.h>
 #ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include "DistributedVectorFieldIO_MPIIOTestBase.h"
 
-using namespace TNL::Communicators;
-
-typedef MpiCommunicator CommunicatorType;
-
 TEST( DistributedVectorFieldIO_MPIIO, Save_1D )
 {
     TestDistributedVectorFieldMPIIO<1,2,Host>::TestSave();
diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h
index 4c89b60ba..d22f6d3eb 100644
--- a/src/UnitTests/main_mpi.h
+++ b/src/UnitTests/main_mpi.h
@@ -6,9 +6,8 @@
 #endif
 
 #if (defined(HAVE_GTEST) && defined(HAVE_MPI))
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/MPI/ScopedInitializer.h>
-using CommunicatorType = TNL::Communicators::MpiCommunicator;
+#include <TNL/MPI/Wrappers.h>
 
 #include <sstream>
 
@@ -37,7 +36,7 @@ public:
    // Called after a test ends.
    virtual void OnTestEnd(const ::testing::TestInfo& test_info)
    {
-      const int rank = CommunicatorType::GetRank(CommunicatorType::AllGroup);
+      const int rank = TNL::MPI::GetRank();
       sout << test_info.test_case_name() << "." << test_info.name() << " End." <<std::endl;
       std::cout << rank << ":" << std::endl << sout.str()<< std::endl;
       sout.str( std::string() );
-- 
GitLab


From 31fa61f9ccd8ba5c50ec9f1b65d11efd238b1f3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 2 Jan 2021 15:52:57 +0100
Subject: [PATCH 49/50] MPI refactoring: cleaned up benchmarks

---
 src/Benchmarks/Benchmarks.h               |  8 +++----
 src/Benchmarks/ODESolvers/SimpleProblem.h | 10 ++++----
 src/Benchmarks/ODESolvers/benchmarks.h    | 29 +----------------------
 3 files changed, 8 insertions(+), 39 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index cbd628b03..2b2389e2c 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -26,7 +26,7 @@
 #include <TNL/SystemInfo.h>
 #include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Config/ConfigDescription.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Benchmarks {
@@ -55,7 +55,7 @@ struct BenchmarkResult
       elements << time << stddev << stddev / time << bandwidth;
       if( speedup != 0 )
          elements << speedup;
-      else 
+      else
          elements << "N/A";
       return elements;
    }
@@ -356,9 +356,7 @@ inline Benchmark::MetadataMap getHardwareMetadata()
        { "system release", SystemInfo::getSystemRelease() },
        { "start time", SystemInfo::getCurrentTime() },
 #ifdef HAVE_MPI
-       { "number of MPI processes", convertToString( (Communicators::MpiCommunicator::IsInitialized())
-                                       ? Communicators::MpiCommunicator::GetSize( Communicators::MpiCommunicator::AllGroup )
-                                       : 1 ) },
+       { "number of MPI processes", convertToString( TNL::MPI::GetSize() ) },
 #endif
        { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) },
        { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) },
diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h
index 122606a32..65f769dda 100644
--- a/src/Benchmarks/ODESolvers/SimpleProblem.h
+++ b/src/Benchmarks/ODESolvers/SimpleProblem.h
@@ -14,11 +14,10 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
    namespace Benchmarks {
-      
+
 template< typename Real = double,
    typename Device = Devices::Host,
    typename Index = int >
@@ -28,8 +27,7 @@ struct SimpleProblem
    using DeviceType = Device;
    using IndexType = Index;
    using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType >;
-   using CommunicatorType = Communicators::MpiCommunicator;
-   
+
    template< typename VectorPointer >
    void getExplicitUpdate( const RealType& time,
       const RealType& tau,
@@ -46,10 +44,10 @@ struct SimpleProblem
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, u.getSize(), computeF, u, fu );
    }
-   
+
    template< typename Vector >
    void applyBoundaryConditions( const RealType& t, Vector& u ) {};
-      
+
 };
 
    } // namespace Benchmarks
diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h
index 60b533663..a6ee67a62 100644
--- a/src/Benchmarks/ODESolvers/benchmarks.h
+++ b/src/Benchmarks/ODESolvers/benchmarks.h
@@ -16,8 +16,6 @@
 #include <TNL/Config/ParameterContainer.h>
 
 #include "../Benchmarks.h"
-#include "SimpleProblem.h"
-
 
 #include <stdexcept>  // std::runtime_error
 
@@ -35,31 +33,6 @@ getPerformer()
    return "CPU";
 }
 
-/*template< typename Matrix >
-void barrier( const Matrix& matrix )
-{
-}
-
-template< typename Matrix, typename Communicator >
-void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix )
-{
-   Communicator::Barrier( matrix.getCommunicationGroup() );
-}*/
-
-template< typename Device >
-bool checkDevice( const Config::ParameterContainer& parameters )
-{
-   const String device = parameters.getParameter< String >( "device" );
-   if( device == "all" )
-      return true;
-   if( std::is_same< Device, Devices::Host >::value && device == "host" )
-      return true;
-   if( std::is_same< Device, Devices::Cuda >::value && device == "cuda" )
-      return true;
-   return false;
-}
-
-
 template< typename Solver, typename VectorPointer >
 void
 benchmarkSolver( Benchmark& benchmark,
@@ -90,7 +63,7 @@ benchmarkSolver( Benchmark& benchmark,
    auto compute = [&]() {
       solver.solve( u );
    };
-   
+
    // subclass BenchmarkResult to add extra columns to the benchmark
    // (iterations, preconditioned residue, true residue)
    /*struct MyBenchmarkResult : public BenchmarkResult
-- 
GitLab


From fb3807b59c1e2b0588e297c257c7238e51c4ef9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 23 Jan 2021 12:02:23 +0100
Subject: [PATCH 50/50] Fixed the index of tutorials

---
 Documentation/Tutorials/index.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index 56a51cc22..55b92ad81 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -2,11 +2,10 @@
 
 ## Tutorials
 
-1. [Building applications with TNL](tutorial_building_applications_with_tnl.html)
-2. [General concepts](tutorial_GeneralConcepts.html)
-3. [Arrays](tutorial_Arrays.html)
-4. [Vectors](tutorial_Vectors.html)
-5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
-6. [For loops](tutorial_ForLoops.html)
-7. [Cross-device pointers](tutorial_Pointers.html)
-8. [Matrices](tutorial_Matrices.html)
+1. [General concepts](tutorial_GeneralConcepts.html)
+2. [Arrays](tutorial_Arrays.html)
+3. [Vectors](tutorial_Vectors.html)
+4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
+5. [For loops](tutorial_ForLoops.html)
+6. [Cross-device pointers](tutorial_Pointers.html)
+7. [Matrices](tutorial_Matrices.html)
-- 
GitLab