From 9a88469e711e804d98aff30c1538118989ab5df4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 29 Dec 2020 21:11:37 +0100
Subject: [PATCH] MPI refactoring: split MpiCommunicator into plain functions
 in the TNL::MPI namespace

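The MpiCommunicator class is kept for now, but its static methods merely
delegate to the new free functions. A minimal sketch of the intended usage,
based only on the wrappers and the RAII initializer added below (not an
exhaustive list of the new functions):

   #include <iostream>
   #include <TNL/MPI/ScopedInitializer.h>
   #include <TNL/MPI/Wrappers.h>

   int main( int argc, char* argv[] )
   {
      TNL::MPI::ScopedInitializer mpi( argc, argv );  // MPI::Init here, MPI::Finalize on destruction
      const int rank = TNL::MPI::GetRank();           // defaults to the MPI_COMM_WORLD group
      const int nproc = TNL::MPI::GetSize();
      if( rank == 0 )
         std::cout << "Running with " << nproc << " MPI processes" << std::endl;
      TNL::MPI::Barrier();
      return 0;
   }
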
---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |   4 +-
 .../tnl-benchmark-linear-solvers.h            |   4 +-
 .../ODESolvers/tnl-benchmark-ode-solvers.h    |  10 +-
 src/TNL/Communicators/MPITypeResolver.h       | 108 ------
 src/TNL/Communicators/MpiCommunicator.h       | 273 ++------------
 src/TNL/Containers/DistributedArray.hpp       |   1 -
 .../Expressions/DistributedComparison.h       |   2 +-
 .../DistributedVerticalOperations.h           |   2 +-
 src/TNL/MPI.h                                 |  29 ++
 .../MpiDefs.h => MPI/DummyDefs.h}             |  17 +-
 .../{Communicators/MPIPrint.h => MPI/Print.h} |  55 ++-
 .../ScopedInitializer.h                       |  15 +-
 src/TNL/MPI/Utils.h                           |  46 +++
 src/TNL/MPI/Wrappers.h                        | 347 ++++++++++++++++++
 src/TNL/MPI/getDataType.h                     | 119 ++++++
 src/TNL/MPI/selectGPU.h                       |  72 ++++
 .../DistributedMeshes/BufferEntitiesHelper.h  |   1 -
 .../DistributedGridIO_MeshFunction.h          |  71 ++--
 .../DistributedGridSynchronizer.h             |   1 -
 src/TNL/Solvers/Solver_impl.h                 |   6 +-
 src/Tools/tnl-game-of-life.cpp                |   4 +-
 src/Tools/tnl-init.cpp                        |   4 +-
 src/Tools/tnl-test-distributed-mesh.h         |   4 +-
 .../DistributedNDArrayOverlaps_1D_test.h      |   1 -
 .../DistributedNDArrayOverlaps_semi1D_test.h  |   1 -
 .../ndarray/DistributedNDArray_1D_test.h      |   1 -
 .../ndarray/DistributedNDArray_semi1D_test.h  |   1 -
 .../DistributedMeshes/DistributedMeshTest.h   |   1 -
 src/UnitTests/main_mpi.h                      |   4 +-
 29 files changed, 740 insertions(+), 464 deletions(-)
 delete mode 100644 src/TNL/Communicators/MPITypeResolver.h
 create mode 100644 src/TNL/MPI.h
 rename src/TNL/{Communicators/MpiDefs.h => MPI/DummyDefs.h} (64%)
 rename src/TNL/{Communicators/MPIPrint.h => MPI/Print.h} (75%)
 rename src/TNL/{Communicators => MPI}/ScopedInitializer.h (72%)
 create mode 100644 src/TNL/MPI/Utils.h
 create mode 100644 src/TNL/MPI/Wrappers.h
 create mode 100644 src/TNL/MPI/getDataType.h
 create mode 100644 src/TNL/MPI/selectGPU.h

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 74a3205d35..abe08210d0 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -20,7 +20,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -309,7 +309,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 06ba2bc946..75b1e0e258 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -25,7 +25,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -592,7 +592,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index aa4370c7ac..fcaaaedf21 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -24,7 +24,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
 
@@ -63,7 +63,7 @@ benchmarkODESolvers( Benchmark& benchmark,
 #ifdef HAVE_CUDA
       CudaVectorPointer cuda_u( dofs );
       *cuda_u = 0.0;
-#endif      
+#endif
       if( solver == "euler" || solver == "all" ) {
          using HostSolver = Solvers::ODE::Euler< HostProblem, SolverMonitorType >;
          benchmark.setOperation("Euler");
@@ -168,10 +168,10 @@ bool resolveRealTypes( Benchmark& benchmark,
    Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
-   if( ( realType == "float" || realType == "all" ) && 
+   if( ( realType == "float" || realType == "all" ) &&
        ! resolveIndexType< float >( benchmark, metadata, parameters ) )
       return false;
-   if( ( realType == "double" || realType == "all" ) && 
+   if( ( realType == "double" || realType == "all" ) &&
        ! resolveIndexType< double >( benchmark, metadata, parameters ) )
       return false;
    return true;
@@ -225,7 +225,7 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
    const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
diff --git a/src/TNL/Communicators/MPITypeResolver.h b/src/TNL/Communicators/MPITypeResolver.h
deleted file mode 100644
index 5429d5e33c..0000000000
--- a/src/TNL/Communicators/MPITypeResolver.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
-                          MPITypeResolver.h  -  description
-                             -------------------
-    begin                : Feb 4, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-namespace TNL {
-namespace Communicators {
-
-#ifdef HAVE_MPI
-template<typename Type>
-struct MPITypeResolver
-{
-   static inline MPI_Datatype getType()
-   {
-      static_assert( sizeof(Type) == sizeof(char) ||
-                     sizeof(Type) == sizeof(int) ||
-                     sizeof(Type) == sizeof(short int) ||
-                     sizeof(Type) == sizeof(long int),
-                     "Fatal Error - Unknown MPI Type");
-      switch( sizeof( Type ) )
-      {
-         case sizeof( char ):
-            return MPI_CHAR;
-         case sizeof( int ):
-            return MPI_INT;
-         case sizeof( short int ):
-            return MPI_SHORT;
-         case sizeof( long int ):
-            return MPI_LONG;
-      }
-      // this will never happen thanks to the static_assert above, but icpc is not that smart
-      // and complains about missing return statement at the end of non-void function
-      throw 0;
-   }
-};
-
-template<> struct MPITypeResolver< char >
-{
-    static inline MPI_Datatype getType(){return MPI_CHAR;};
-};
-
-template<> struct MPITypeResolver< int >
-{
-    static inline MPI_Datatype getType(){return MPI_INT;};
-};
-
-template<> struct MPITypeResolver< short int >
-{
-    static inline MPI_Datatype getType(){return MPI_SHORT;};
-};
-
-template<> struct MPITypeResolver< long int >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG;};
-};
-
-template<> struct MPITypeResolver< unsigned char >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;};
-};
-
-template<> struct MPITypeResolver< unsigned short int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;};
-};
-
-template<> struct MPITypeResolver< unsigned int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED;};
-};
-
-template<> struct MPITypeResolver< unsigned long int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;};
-};
-
-template<> struct MPITypeResolver< float >
-{
-    static inline MPI_Datatype getType(){return MPI_FLOAT;};
-};
-
-template<> struct MPITypeResolver< double >
-{
-    static inline MPI_Datatype getType(){return MPI_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< long double >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< bool >
-{
-   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
-   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
-   static inline MPI_Datatype getType() { return MPI_C_BOOL; };
-};
-#endif
-
-} // namespace Communicators
-} // namespace TNL
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index dedc35f03a..1995978c5e 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -11,36 +11,23 @@
 #pragma once
 
 #include <iostream>
-#include <fstream>
-#include <cstring>
 
 #ifdef HAVE_MPI
-#include <mpi.h>
 #ifdef OMPI_MAJOR_VERSION
    // header specific to OpenMPI (needed for CUDA-aware detection)
    #include <mpi-ext.h>
 #endif
 
 #include <unistd.h>  // getpid
-
-#ifdef HAVE_CUDA
-    #include <TNL/Cuda/CheckDevice.h>
-
-    typedef struct __attribute__((__packed__))  {
-       char name[MPI_MAX_PROCESSOR_NAME];
-    } procName;
-#endif
-
 #endif
 
 #include <TNL/String.h>
 #include <TNL/Logger.h>
-#include <TNL/Debugging/OutputRedirection.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/Wrappers.h>
+#include <TNL/MPI/DummyDefs.h>
+#include <TNL/MPI/Utils.h>
 #include <TNL/Config/ConfigDescription.h>
-#include <TNL/Exceptions/MPISupportMissing.h>
 #include <TNL/Exceptions/MPIDimsCreateError.h>
-#include <TNL/Communicators/MPITypeResolver.h>
 
 
 namespace TNL {
@@ -88,7 +75,7 @@ class MpiCommunicator
             const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
             const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
             if( redirect )
-               setupRedirection( outputDirectory );
+               MPI::setupRedirection( outputDirectory );
 #ifdef HAVE_CUDA
             int size;
             MPI_Comm_size( MPI_COMM_WORLD, &size );
@@ -144,125 +131,32 @@ class MpiCommunicator
 
       static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
       {
-#ifdef HAVE_MPI
-         switch( required_thread_level ) {
-            case MPI_THREAD_SINGLE:
-            case MPI_THREAD_FUNNELED:
-            case MPI_THREAD_SERIALIZED:
-            case MPI_THREAD_MULTIPLE:
-               break;
-            default:
-               printf("ERROR: invalid argument for the 'required' thread level support: %d\n", required_thread_level);
-               MPI_Abort(MPI_COMM_WORLD, 1);
-         }
-
-         int provided;
-         MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
-         if( provided < required_thread_level ) {
-            const char* level = "";
-            switch( required_thread_level ) {
-               case MPI_THREAD_SINGLE:
-                  level = "MPI_THREAD_SINGLE";
-                  break;
-               case MPI_THREAD_FUNNELED:
-                  level = "MPI_THREAD_FUNNELED";
-                  break;
-               case MPI_THREAD_SERIALIZED:
-                  level = "MPI_THREAD_SERIALIZED";
-                  break;
-               case MPI_THREAD_MULTIPLE:
-                  level = "MPI_THREAD_MULTIPLE";
-                  break;
-            }
-            printf("ERROR: The MPI library does not have the required level of thread support: %s\n", level);
-            MPI_Abort(MPI_COMM_WORLD, 1);
-         }
-
-         selectGPU();
-#endif
+         MPI::Init( argc, argv, required_thread_level );
 
          // silence warnings about (potentially) unused variables
          (void) NullGroup;
-         (void) NullRequest;
-      }
-
-      static void setupRedirection( std::string outputDirectory )
-      {
-#ifdef HAVE_MPI
-         if(isDistributed() )
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank(AllGroup)) + ".txt";
-               const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank(AllGroup)) + ".txt";
-               std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
-               Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
-            }
-         }
-#endif
       }
 
       static void Finalize()
       {
-#ifdef HAVE_MPI
-         if(isDistributed())
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               // restore redirection (not necessary, it uses RAII internally...)
-               Debugging::redirect_stdout_stderr( "", "", true );
-            }
-         }
-         MPI_Finalize();
-#endif
+         MPI::Finalize();
       }
 
       static bool IsInitialized()
       {
-#ifdef HAVE_MPI
-         int initialized, finalized;
-         MPI_Initialized(&initialized);
-         MPI_Finalized(&finalized);
-         return initialized && !finalized;
-#else
-         return true;
-#endif
+         return MPI::isInitialized();
       }
 
       static int GetRank(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetRank cannot be called with NullGroup");
-         int rank;
-         MPI_Comm_rank(group,&rank);
-         return rank;
-#else
-         return 0;
-#endif
+         return MPI::GetRank( group );
       }
 
       static int GetSize(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetSize cannot be called with NullGroup");
-         int size;
-         MPI_Comm_size(group,&size);
-         return size;
-#else
-         return 1;
-#endif
+         return MPI::GetSize( group );
       }
 
-#ifdef HAVE_MPI
-      template< typename T >
-      static MPI_Datatype getDataType( const T& t )
-      {
-         return MPITypeResolver< T >::getType();
-      }
-#endif
-
       //dim-number of dimensions, distr array of guess distr - 0 for computation
       //distr array will be filled by computed distribution
       //more information in MPI documentation
@@ -291,78 +185,42 @@ class MpiCommunicator
 
       static void Barrier( CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Barrier cannot be called with NullGroup");
-         MPI_Barrier(group);
-#endif
+         MPI::Barrier( group );
       }
 
       template <typename T>
       static void Send( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Send cannot be called with NullGroup");
-         MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group );
-#endif
+         MPI::Send( data, count, dest, tag, group );
       }
 
       template <typename T>
       static void Recv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status );
-#endif
-     }
+         MPI::Recv( data, count, src, tag, group );
+      }
 
       template <typename T>
       static Request ISend( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "ISend cannot be called with NullGroup");
-         Request req;
-         MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req);
-         return req;
-#else
-         return 1;
-#endif
+         return MPI::Isend( data, count, dest, tag, group );
       }
 
       template <typename T>
       static Request IRecv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "IRecv cannot be called with NullGroup");
-         Request req;
-         MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req);
-         return req;
-#else
-         return 1;
-#endif
+         return MPI::Irecv( data, count, src, tag, group );
       }
 
       static void WaitAll(Request *reqs, int length)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE);
-#endif
+         MPI::Waitall( reqs, length );
       }
 
       template< typename T >
       static void Bcast( T* data, int count, int root, CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
-         MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group);
-#endif
+         MPI::Bcast( data, count, root, group );
       }
 
       template< typename T >
@@ -372,12 +230,7 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group);
-#else
-         memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
-#endif
+         MPI::Allreduce( data, reduced_data, count, op, group );
       }
 
       // in-place variant of Allreduce
@@ -387,27 +240,18 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group);
-#endif
+         MPI::Allreduce( data, count, op, group );
       }
 
-
       template< typename T >
       static void Reduce( const T* data,
                           T* reduced_data,
                           int count,
-                          MPI_Op &op,
+                          const MPI_Op &op,
                           int root,
                           CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup");
-         MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group);
-#else
-         memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) );
-#endif
+         MPI::Reduce( data, reduced_data, count, op, root, group );
       }
 
       template< typename T >
@@ -421,24 +265,7 @@ class MpiCommunicator
                                int receiveTag,
                                CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Sendrecv( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       destination,
-                       sendTag,
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       source,
-                       receiveTag,
-                       group,
-                       &status );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Sendrecv( sendData, sendCount, destination, sendTag, receiveData, receiveCount, source, receiveTag, group );
       }
 
       template< typename T >
@@ -448,19 +275,7 @@ class MpiCommunicator
                             int receiveCount,
                             CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Alltoall( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       group );
-#else
-         TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
-         memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) );
-#endif
+         MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
 
@@ -485,58 +300,16 @@ class MpiCommunicator
       }
 
 #ifdef HAVE_MPI
-      static MPI_Request NullRequest;
       static MPI_Comm AllGroup;
       static MPI_Comm NullGroup;
 #else
-      static constexpr int NullRequest = -1;
       static constexpr int AllGroup = 1;
       static constexpr int NullGroup = 0;
 #endif
    private:
-
-      static void selectGPU(void)
-      {
-#ifdef HAVE_MPI
-    #ifdef HAVE_CUDA
-         const int count = GetSize(AllGroup);
-         const int rank = GetRank(AllGroup);
-         int gpuCount;
-         cudaGetDeviceCount(&gpuCount);
-
-         procName names[count];
-
-         int i=0;
-         int len;
-         MPI_Get_processor_name(names[rank].name, &len);
-
-         for(i=0;i<count;i++)
-            std::memcpy(names[i].name,names[rank].name,len+1);
-
-         MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-            (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-                     MPI_COMM_WORLD);
-
-         int nodeRank=0;
-         for(i=0;i<rank;i++)
-         {
-            if(std::strcmp(names[rank].name,names[i].name)==0)
-               nodeRank++;
-         }
-
-         const int gpuNumber = nodeRank % gpuCount;
-
-         cudaSetDevice(gpuNumber);
-         TNL_CHECK_CUDA_DEVICE;
-
-         //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
-    #endif
-#endif
-      }
 };
 
 #ifdef HAVE_MPI
-MPI_Request MpiCommunicator::NullRequest = MPI_REQUEST_NULL;
 MPI_Comm MpiCommunicator::AllGroup = MPI_COMM_WORLD;
 MPI_Comm MpiCommunicator::NullGroup = MPI_COMM_NULL;
 #endif
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index cd0eb49d5e..61dc3eda08 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -15,7 +15,6 @@
 #include "DistributedArray.h"
 
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Communicators/MpiDefs.h>  // important only when MPI is disabled
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h
index 1cef0873dd..2695ccccc3 100644
--- a/src/TNL/Containers/Expressions/DistributedComparison.h
+++ b/src/TNL/Containers/Expressions/DistributedComparison.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/DummyDefs.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index b525e8a539..f55ae3d4ae 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/VerticalOperations.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/DummyDefs.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h
new file mode 100644
index 0000000000..b1b7dd6988
--- /dev/null
+++ b/src/TNL/MPI.h
@@ -0,0 +1,29 @@
+/***************************************************************************
+                          MPI.h  -  description
+                             -------------------
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+/**
+ * \brief A convenience header file which includes all headers from the
+ * `TNL/MPI/` subdirectory.
+ *
+ * Users may use this to avoid having to include many header files in their
+ * projects. On the other hand, parts of the TNL library should generally
+ * include only the specific headers they need, in order to avoid cycles in
+ * the header inclusion.
+ */
+
+#include "MPI/DummyDefs.h"
+#include "MPI/getDataType.h"
+#include "MPI/selectGPU.h"
+#include "MPI/Wrappers.h"
+#include "MPI/Utils.h"
+#include "MPI/ScopedInitializer.h"
+#include "MPI/Print.h"
diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/MPI/DummyDefs.h
similarity index 64%
rename from src/TNL/Communicators/MpiDefs.h
rename to src/TNL/MPI/DummyDefs.h
index df43005ec2..cdd5ea4838 100644
--- a/src/TNL/Communicators/MpiDefs.h
+++ b/src/TNL/MPI/DummyDefs.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          MpiCommunicator.h  -  description
+                          MPI/DummyDefs.h  -  description
                              -------------------
-    begin                : 2005/04/23
-    copyright            : (C) 2005 by Tomas Oberhuber
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -11,6 +11,9 @@
 #pragma once
 
 #ifndef HAVE_MPI
+using MPI_Request = int;
+using MPI_Comm = int;
+
 enum MPI_Op {
    MPI_MAX,
    MPI_MIN,
@@ -28,9 +31,9 @@ enum MPI_Op {
 
 // MPI_Init_thread constants
 enum {
-  MPI_THREAD_SINGLE,
-  MPI_THREAD_FUNNELED,
-  MPI_THREAD_SERIALIZED,
-  MPI_THREAD_MULTIPLE
+   MPI_THREAD_SINGLE,
+   MPI_THREAD_FUNNELED,
+   MPI_THREAD_SERIALIZED,
+   MPI_THREAD_MULTIPLE
 };
 #endif
diff --git a/src/TNL/Communicators/MPIPrint.h b/src/TNL/MPI/Print.h
similarity index 75%
rename from src/TNL/Communicators/MPIPrint.h
rename to src/TNL/MPI/Print.h
index 6d78eafaf8..5cd4819a29 100644
--- a/src/TNL/Communicators/MPIPrint.h
+++ b/src/TNL/MPI/Print.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          MPIPrint.h  -  description
+                          MPI/Print.h  -  description
                              -------------------
     begin                : Feb 7, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -10,34 +10,35 @@
 
 #pragma once
 
+#include <iostream>
 #include <sstream>
-#include <TNL/Communicators/MpiCommunicator.h>
+
+#include <TNL/String.h>
+#include <TNL/MPI/Wrappers.h>
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT( message )                                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       std::stringstream __tnl_mpi_print_stream_;                                                                                 \
-      __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                             \
-         << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                     \
+      __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                        \
+                              << message << std::endl;                                                                           \
       TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                      \
       mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                                  \
    }                                                                                                                             \
    else                                                                                                                          \
    {                                                                                                                             \
-      std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;              \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
-         {                                                                                                                       \
-            TNL::String __tnl_mpi_print_string_;                                                                                 \
-            mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                         \
-            std::cerr << __tnl_mpi_print_string_;                                                                                \
-         }                                                                                                                       \
+      std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                         \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
+      {                                                                                                                          \
+         TNL::String __tnl_mpi_print_string_;                                                                                    \
+         mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                            \
+         std::cerr << __tnl_mpi_print_string_;                                                                                   \
+      }                                                                                                                          \
    }                                                                                                                             \
 }
 #else
@@ -47,11 +48,11 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_MASTER( message )                                                                                          \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() == 0 )                                                                     \
+   if( TNL::MPI::GetRank() == 0 )                                                                                                \
    {                                                                                                                             \
       std::cerr << "Master node : " << message << std::endl;                                                                     \
    }                                                                                                                             \
@@ -63,20 +64,20 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_COND( condition, message )                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
 {                                                                                                                                \
    if( condition) std::cerr << message << std::endl;                                                                             \
 }                                                                                                                                \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       int __tnl_mpi_print_cnd = ( condition );                                                                                   \
-      TNL::Communicators::MpiCommunicator::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                \
+      TNL::MPI::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                                           \
       if( condition ) {                                                                                                          \
          std::stringstream __tnl_mpi_print_stream_;                                                                              \
-         __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                          \
-            << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                  \
+         __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                     \
+                                 << message << std::endl;                                                                        \
          TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                   \
          mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                               \
       }                                                                                                                          \
@@ -84,13 +85,11 @@ else
    else                                                                                                                          \
    {                                                                                                                             \
       if( condition )                                                                                                            \
-         std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;           \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
+         std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                      \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
          {                                                                                                                       \
             int __tnl_mpi_print_cond;                                                                                            \
-            TNL::Communicators::MpiCommunicator::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                         \
+            TNL::MPI::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                                                    \
             if( __tnl_mpi_print_cond )                                                                                           \
             {                                                                                                                    \
                TNL::String __tnl_mpi_print_string_;                                                                              \
diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h
similarity index 72%
rename from src/TNL/Communicators/ScopedInitializer.h
rename to src/TNL/MPI/ScopedInitializer.h
index 2970bc6283..82ba02bc57 100644
--- a/src/TNL/Communicators/ScopedInitializer.h
+++ b/src/TNL/MPI/ScopedInitializer.h
@@ -12,22 +12,25 @@
 
 #pragma once
 
+#include "Wrappers.h"
+#include "Utils.h"
+
 namespace TNL {
-namespace Communicators {
+namespace MPI {
 
-template< typename Communicator >
 struct ScopedInitializer
 {
-   ScopedInitializer( int& argc, char**& argv )
+   ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
    {
-      Communicator::Init( argc, argv );
+      Init( argc, argv, required_thread_level );
    }
 
    ~ScopedInitializer()
    {
-      Communicator::Finalize();
+      restoreRedirection();
+      Finalize();
    }
 };
 
-} // namespace Communicators
+} // namespace MPI
 } // namespace TNL
diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h
new file mode 100644
index 0000000000..b655aefd00
--- /dev/null
+++ b/src/TNL/MPI/Utils.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          MPI/Utils.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Debugging/OutputRedirection.h>
+
+#include "Wrappers.h"
+
+namespace TNL {
+namespace MPI {
+
+inline bool isInitialized()
+{
+   return Initialized() && ! Finalized();
+}
+
+inline void setupRedirection( std::string outputDirectory )
+{
+#ifdef HAVE_MPI
+   if( GetSize() > 1 && GetRank() != 0 ) {
+      const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank()) + ".txt";
+      const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank()) + ".txt";
+      std::cout << GetRank() << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
+      Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
+   }
+#endif
+}
+
+// restore redirection (usually not necessary, it uses RAII internally...)
+inline void restoreRedirection()
+{
+   if( GetSize() > 1 && GetRank() != 0 ) {
+      Debugging::redirect_stdout_stderr( "", "", true );
+   }
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
new file mode 100644
index 0000000000..9a057da5f5
--- /dev/null
+++ b/src/TNL/MPI/Wrappers.h
@@ -0,0 +1,347 @@
+/***************************************************************************
+                          MPI/Wrappers.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#else
+   #include "DummyDefs.h"
+   #include <cstring>  // std::memcpy
+   #include <TNL/Exceptions/MPISupportMissing.h>
+#endif
+
+#include <TNL/Assert.h>
+#include "getDataType.h"
+#include "selectGPU.h"
+
+namespace TNL {
+namespace MPI {
+
+// function wrappers for MPI constants
+
+inline MPI_Comm AllGroup()
+{
+#ifdef HAVE_MPI
+   return MPI_COMM_WORLD;
+#else
+   return 1;
+#endif
+}
+
+inline MPI_Comm NullGroup()
+{
+#ifdef HAVE_MPI
+   return MPI_COMM_NULL;
+#else
+   return 0;
+#endif
+}
+
+inline MPI_Request NullRequest()
+{
+#ifdef HAVE_MPI
+   return MPI_REQUEST_NULL;
+#else
+   return 0;
+#endif
+}
+
+// wrappers for basic MPI functions
+
+inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
+{
+#ifdef HAVE_MPI
+   switch( required_thread_level ) {
+      case MPI_THREAD_SINGLE:
+      case MPI_THREAD_FUNNELED:
+      case MPI_THREAD_SERIALIZED:
+      case MPI_THREAD_MULTIPLE:
+         break;
+      default:
+         std::cerr << "ERROR: invalid argument for the 'required' thread level support: " << required_thread_level << std::endl;
+         MPI_Abort(MPI_COMM_WORLD, 1);
+   }
+
+   int provided;
+   MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
+   if( provided < required_thread_level ) {
+      const char* level = "";
+      switch( required_thread_level ) {
+         case MPI_THREAD_SINGLE:
+            level = "MPI_THREAD_SINGLE";
+            break;
+         case MPI_THREAD_FUNNELED:
+            level = "MPI_THREAD_FUNNELED";
+            break;
+         case MPI_THREAD_SERIALIZED:
+            level = "MPI_THREAD_SERIALIZED";
+            break;
+         case MPI_THREAD_MULTIPLE:
+            level = "MPI_THREAD_MULTIPLE";
+            break;
+      }
+      std::cerr << "ERROR: The MPI library does not have the required level of thread support: " << level << std::endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+   }
+
+   selectGPU();
+#endif
+}
+
+inline void Finalize()
+{
+#ifdef HAVE_MPI
+   MPI_Finalize();
+#endif
+}
+
+inline bool Initialized()
+{
+#ifdef HAVE_MPI
+   int flag;
+   MPI_Initialized(&flag);
+   return flag;
+#else
+   return true;
+#endif
+}
+
+inline bool Finalized()
+{
+#ifdef HAVE_MPI
+   int flag;
+   MPI_Finalized(&flag);
+   return flag;
+#else
+   return false;
+#endif
+}
+
+inline int GetRank( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetRank cannot be called with NullGroup" );
+   int rank;
+   MPI_Comm_rank( group, &rank );
+   return rank;
+#else
+   return 0;
+#endif
+}
+
+inline int GetSize( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetSize cannot be called with NullGroup" );
+   int size;
+   MPI_Comm_size( group, &size );
+   return size;
+#else
+   return 1;
+#endif
+}
+
+// wrappers for MPI communication functions
+
+inline void Barrier( MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Barrier cannot be called with NullGroup" );
+   MPI_Barrier(group);
+#endif
+}
+
+inline void Waitall( MPI_Request* reqs, int length )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   MPI_Waitall( length, reqs, MPI_STATUSES_IGNORE );
+#endif
+}
+
+template< typename T >
+void Send( const T* data,
+           int count,
+           int dest,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Send cannot be called with NullGroup" );
+   MPI_Send( (const void*) data, count, getDataType<T>(), dest, tag, group );
+#endif
+}
+
+template< typename T >
+void Recv( T* data,
+           int count,
+           int src,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Recv cannot be called with NullGroup" );
+   MPI_Recv( (void*) data, count, getDataType<T>(), src, tag, group, MPI_STATUS_IGNORE );
+#endif
+}
+
+template< typename T >
+void Sendrecv( const T* sendData,
+               int sendCount,
+               int destination,
+               int sendTag,
+               T* receiveData,
+               int receiveCount,
+               int source,
+               int receiveTag,
+               MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Sendrecv cannot be called with NullGroup" );
+   MPI_Sendrecv( (void*) sendData,
+                 sendCount,
+                 getDataType<T>(),
+                 destination,
+                 sendTag,
+                 (void*) receiveData,
+                 receiveCount,
+                 getDataType<T>(),
+                 source,
+                 receiveTag,
+                 group,
+                 MPI_STATUS_IGNORE );
+#else
+   throw Exceptions::MPISupportMissing();
+#endif
+}
+
+template< typename T >
+MPI_Request Isend( const T* data,
+                   int count,
+                   int dest,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Isend cannot be called with NullGroup" );
+   MPI_Request req;
+   MPI_Isend( (const void*) data, count, getDataType<T>(), dest, tag, group, &req );
+   return req;
+#else
+   return NullRequest();
+#endif
+}
+
+template< typename T >
+MPI_Request Irecv( T* data,
+                   int count,
+                   int src,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Irecv cannot be called with NullGroup" );
+   MPI_Request req;
+   MPI_Irecv( (void*) data, count, getDataType<T>(), src, tag, group, &req );
+   return req;
+#else
+   return NullRequest();
+#endif
+}
+
+template< typename T >
+void Allreduce( const T* data,
+                T* reduced_data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, group );
+#else
+   std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );
+#endif
+}
+
+// in-place variant of Allreduce
+template< typename T >
+void Allreduce( T* data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType<T>(), op, group );
+#endif
+}
+
+template< typename T >
+void Reduce( const T* data,
+             T* reduced_data,
+             int count,
+             const MPI_Op& op,
+             int root,
+             MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Reduce cannot be called with NullGroup" );
+   MPI_Reduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, root, group );
+#else
+   std::memcpy( (void*) reduced_data, (void*) data, count * sizeof(T) );
+#endif
+}
+
+template< typename T >
+void Bcast( T* data, int count, int root, MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Bcast cannot be called with NullGroup" );
+   MPI_Bcast( (void*) data, count, getDataType<T>(), root, group );
+#endif
+}
+
+template< typename T >
+void Alltoall( const T* sendData,
+               int sendCount,
+               T* receiveData,
+               int receiveCount,
+               MPI_Comm group )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Alltoall cannot be called with NullGroup" );
+   MPI_Alltoall( (const void*) sendData,
+                 sendCount,
+                 getDataType<T>(),
+                 (void*) receiveData,
+                 receiveCount,
+                 getDataType<T>(),
+                 group );
+#else
+   TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
+   std::memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof(T) );
+#endif
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/getDataType.h b/src/TNL/MPI/getDataType.h
new file mode 100644
index 0000000000..f3570679bf
--- /dev/null
+++ b/src/TNL/MPI/getDataType.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          getDataType.h  -  description
+                             -------------------
+    begin                : Feb 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#endif
+
+namespace TNL {
+namespace MPI {
+
+#ifdef HAVE_MPI
+template< typename T >
+struct TypeResolver
+{
+   static inline MPI_Datatype getType()
+   {
+      static_assert( sizeof(T) == sizeof(char) ||
+                     sizeof(T) == sizeof(int) ||
+                     sizeof(T) == sizeof(short int) ||
+                     sizeof(T) == sizeof(long int),
+                     "Fatal Error - Unknown MPI Type");
+      switch( sizeof(T) )
+      {
+         case sizeof(char):
+            return MPI_CHAR;
+         case sizeof(int):
+            return MPI_INT;
+         case sizeof(short int):
+            return MPI_SHORT;
+         case sizeof(long int):
+            return MPI_LONG;
+      }
+      // This will never happen thanks to the static_assert above, but icpc is
+      // not that smart and complains about missing return statement at the end
+      // of non-void function.
+      throw 0;
+   }
+};
+
+template<> struct TypeResolver< char >
+{
+   static inline MPI_Datatype getType(){return MPI_CHAR;};
+};
+
+template<> struct TypeResolver< int >
+{
+   static inline MPI_Datatype getType(){return MPI_INT;};
+};
+
+template<> struct TypeResolver< short int >
+{
+   static inline MPI_Datatype getType(){return MPI_SHORT;};
+};
+
+template<> struct TypeResolver< long int >
+{
+   static inline MPI_Datatype getType(){return MPI_LONG;};
+};
+
+template<> struct TypeResolver< unsigned char >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;};
+};
+
+template<> struct TypeResolver< unsigned short int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;};
+};
+
+template<> struct TypeResolver< unsigned int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED;};
+};
+
+template<> struct TypeResolver< unsigned long int >
+{
+   static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;};
+};
+
+template<> struct TypeResolver< float >
+{
+   static inline MPI_Datatype getType(){return MPI_FLOAT;};
+};
+
+template<> struct TypeResolver< double >
+{
+   static inline MPI_Datatype getType(){return MPI_DOUBLE;};
+};
+
+template<> struct TypeResolver< long double >
+{
+   static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;};
+};
+
+template<> struct TypeResolver< bool >
+{
+   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
+   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
+   static inline MPI_Datatype getType() { return MPI_C_BOOL; };
+};
+
+template< typename T >
+MPI_Datatype getDataType( const T& = T{} )
+{
+   return TypeResolver< T >::getType();
+}
+#endif
+
+} // namespace MPI
+} // namespace TNL
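
For reference, a minimal sketch of how getDataType() is meant to be used at MPI call sites (illustrative only; sendBlock is a hypothetical helper, not part of this patch):

   #include <mpi.h>
   #include <TNL/MPI/getDataType.h>

   template< typename T >
   void sendBlock( const T* data, int count, int dest, int tag, MPI_Comm comm )
   {
      // The MPI datatype is deduced from T instead of being spelled out by hand.
      MPI_Send( data, count, TNL::MPI::getDataType< T >(), dest, tag, comm );
   }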
diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h
new file mode 100644
index 0000000000..def9a329f9
--- /dev/null
+++ b/src/TNL/MPI/selectGPU.h
@@ -0,0 +1,72 @@
+/***************************************************************************
+                          selectGPU.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <cstring>
+
+#include <TNL/Cuda/CheckDevice.h>
+
+namespace TNL {
+namespace MPI {
+namespace {
+
+#ifdef HAVE_MPI
+#ifdef HAVE_CUDA
+   typedef struct __attribute__((__packed__)) {
+      char name[MPI_MAX_PROCESSOR_NAME];
+   } procName;
+#endif
+#endif
+
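+// Binds the calling MPI rank to a CUDA device: processor names are exchanged via
+// MPI_Alltoall, the node-local rank is counted, and the device is chosen as
+// nodeRank % gpuCount. No-op unless both HAVE_MPI and HAVE_CUDA are defined.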
+inline void selectGPU()
+{
+#ifdef HAVE_MPI
+#ifdef HAVE_CUDA
+   int size;
+   MPI_Comm_size( MPI_COMM_WORLD, &size );
+   int rank;
+   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+   int gpuCount;
+   cudaGetDeviceCount( &gpuCount );
+
+   procName names[size];
+
+   int i=0;
+   int len;
+   MPI_Get_processor_name(names[rank].name, &len);
+
+   for(i=0;i<size;i++)
+      std::memcpy(names[i].name,names[rank].name,len+1);
+
+   MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
+      (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
+               MPI_COMM_WORLD);
+
+   int nodeRank=0;
+   for(i=0;i<rank;i++)
+   {
+      if(std::strcmp(names[rank].name,names[i].name)==0)
+         nodeRank++;
+   }
+
+   const int gpuNumber = nodeRank % gpuCount;
+
+   cudaSetDevice(gpuNumber);
+   TNL_CHECK_CUDA_DEVICE;
+
+   //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
+#endif
+#endif
+}
+
+} // namespace <unnamed>
+} // namespace MPI
+} // namespace TNL
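
The helper above is self-contained, so a rank can pin its GPU explicitly right after MPI is initialized. A minimal usage sketch (illustrative only; whether ScopedInitializer already invokes selectGPU() internally is not shown in this hunk):

   #include <mpi.h>
   #include <TNL/MPI/selectGPU.h>

   int main( int argc, char* argv[] )
   {
      MPI_Init( &argc, &argv );
      TNL::MPI::selectGPU();   // ranks sharing a node pick distinct GPUs round-robin
      // ... distributed CUDA work ...
      MPI_Finalize();
      return 0;
   }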
diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
index 6030b976f0..04647cb4af 100644
--- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
+++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
@@ -12,7 +12,6 @@
 
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Communicators/MPIPrint.h>
 
 namespace TNL {
 namespace Meshes {
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
index 60605c6eb0..99f505bba7 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -12,6 +12,7 @@
 
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/MPI/getDataType.h>
 
 namespace TNL {
 namespace Meshes {
@@ -19,7 +20,7 @@ namespace DistributedMeshes {
 
 
 /*
- * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. 
+ * This variant creates a copy of the MeshFunction, but smaller, reduced to local entities, without overlap.
  * It is slow and has high RAM consumption.
  */
 template< typename MeshFunction,
@@ -88,8 +89,8 @@ class DistributedGridIO<
          return true;
 
       };
-            
-    static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+
+    static bool load(const String& fileName,MeshFunctionType &meshFunction)
     {
         auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
         if(distrGrid==NULL) //not distributed
@@ -99,10 +100,10 @@ class DistributedGridIO<
         }
 
         const MeshType& mesh=meshFunction.getMesh();
-        
+
         PointType spaceSteps=mesh.getSpaceSteps();
         PointType origin=mesh.getOrigin();
-                
+
         CoordinatesType localSize=distrGrid->getLocalSize();
         CoordinatesType localBegin=distrGrid->getLocalBegin();
 
@@ -111,33 +112,33 @@ class DistributedGridIO<
         newMesh->setSpaceSteps(spaceSteps);
         CoordinatesType newOrigin;
         newMesh->setOrigin(origin+spaceSteps*localBegin);
-        
+
         VectorType newDof(newMesh-> template getEntitiesCount< typename MeshType::Cell >());
         MeshFunctionType newMeshFunction;
-        newMeshFunction.bind(newMesh,newDof); 
+        newMeshFunction.bind(newMesh,newDof);
 
         CoordinatesType zeroCoord;
-        zeroCoord.setValue(0);        
+        zeroCoord.setValue(0);
 
         File file;
         file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), std::ios_base::in );
         newMeshFunction.boundLoad(file);
         file.close();
         CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
-        
+
         return true;
     };
-    
+
 };
 
 /*
- * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, 
+ * Save distributed data into a single file without overlaps using MPI-IO and MPI datatypes,
  * EXPLOSIVE: works only with Grids and MPI
  * BAD IMPLEMENTATION: MPI types are created at every save! -- I don't want to contaminate more places with MPI.
  */
 
 #ifdef HAVE_MPI
-template<typename MeshFunctionType> 
+template<typename MeshFunctionType>
 class DistributedGridIO_MPIIOBase
 {
    public:
@@ -152,7 +153,7 @@ class DistributedGridIO_MPIIOBase
     static bool save(const String& fileName, MeshFunctionType &meshFunction, RealType *data)
     {
 		auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
-        
+
         if(distrGrid==NULL) //not distributed
         {
             meshFunction.save(fileName);
@@ -168,7 +169,7 @@ class DistributedGridIO_MPIIOBase
                       &file);
       if( ok != 0 )
         throw std::runtime_error("Open file failed");
-      
+
 		int written=save(file,meshFunction, data,0);
 
         MPI_File_close(&file);
@@ -176,7 +177,7 @@ class DistributedGridIO_MPIIOBase
 		return written>0;
 
 	};
-    
+
     static int save(MPI_File &file, MeshFunctionType &meshFunction, RealType *data, int offset)
     {
 
@@ -187,7 +188,7 @@ class DistributedGridIO_MPIIOBase
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
 
        int headerSize;
-	   
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
        if(Communicators::MpiCommunicator::GetRank(group)==0)
@@ -200,9 +201,9 @@ class DistributedGridIO_MPIIOBase
 	   offset +=headerSize;
 
        MPI_File_set_view(file,offset,
-               Communicators::MPITypeResolver<RealType>::getType(),
+               TNL::MPI::getDataType<RealType>(),
                ftype,"native",MPI_INFO_NULL);
-       
+
        MPI_Status wstatus;
 
        MPI_File_write(file,data,1,atype,&wstatus);
@@ -222,7 +223,7 @@ class DistributedGridIO_MPIIOBase
         int fstarts[dim];
         int flsize[dim];
         int fgsize[dim];
-        
+
         hackArray(dim,fstarts,distrGrid->getGlobalBegin().getData());
         hackArray(dim,flsize,distrGrid->getLocalSize().getData());
         hackArray(dim,fgsize,distrGrid->getGlobalSize().getData());
@@ -230,14 +231,14 @@ class DistributedGridIO_MPIIOBase
         MPI_Type_create_subarray(dim,
             fgsize,flsize,fstarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype);
 
         MPI_Type_commit(ftype);
 
        int agsize[dim];
        int alsize[dim];
-       int astarts[dim]; 
+       int astarts[dim];
 
        hackArray(dim,astarts,distrGrid->getLocalBegin().getData());
        hackArray(dim,alsize,distrGrid->getLocalSize().getData());
@@ -246,7 +247,7 @@ class DistributedGridIO_MPIIOBase
        MPI_Type_create_subarray(dim,
             agsize,alsize,astarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             atype);
        MPI_Type_commit(atype);
 
@@ -350,9 +351,9 @@ class DistributedGridIO_MPIIOBase
       MPI_File_close(&file);
       return ret;
    }
-            
+
     /* Funky bomb - no checks - only dirty load */
-    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) 
+    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset )
     {
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
 
@@ -360,7 +361,7 @@ class DistributedGridIO_MPIIOBase
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
-       
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
        int headerSize=0;
@@ -371,18 +372,18 @@ class DistributedGridIO_MPIIOBase
             headerSize=readMeshFunctionHeader(file,meshFunction,dataCount);
        }
        MPI_Bcast(&headerSize, 1, MPI_INT,0, group);
-       
+
        if(headerSize<0)
             return false;
 
        offset+=headerSize;
 
        MPI_File_set_view(file,offset,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype,"native",MPI_INFO_NULL);
        MPI_Status wstatus;
        MPI_File_read(file,(void*)data,1,atype,&wstatus);
-        
+
        MPI_Type_free(&atype);
        MPI_Type_free(&ftype);
 
@@ -412,7 +413,7 @@ class DistributedGridIO_MPIIOBase
         size+=count*sizeof(char);
         MPI_File_read(file, (void *)&count,1, MPI_INT, &rstatus);//DATACOUNT
         size+=1*sizeof(int);
-        
+
         if(count!=length)
         {
             std::cerr<<"Chyba načítání MeshFunction, délka dat v souboru neodpovídá očekávané délce" << std::endl;
@@ -421,7 +422,7 @@ class DistributedGridIO_MPIIOBase
 
         return size;
     };
-    
+
 };
 #endif
 
@@ -444,10 +445,10 @@ class DistributedGridIO<
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector=meshFunction.getData();
-            typename MeshFunctionType::RealType * data=hostVector.getData();  
+            typename MeshFunctionType::RealType * data=hostVector.getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
@@ -455,12 +456,12 @@ class DistributedGridIO<
          return false;
       };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector.setLike(meshFunction.getData());
             auto* data=hostVector.getData();
@@ -501,7 +502,7 @@ class DistributedGridIO<
          return false;
     };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
          if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
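
The call-site change in this file follows a single pattern: Communicators::MPITypeResolver< RealType >::getType() becomes TNL::MPI::getDataType< RealType >(). A condensed sketch of one such MPI-IO call site (illustrative only; setDataView is a hypothetical helper, not part of the patch):

   #include <mpi.h>
   #include <TNL/MPI/getDataType.h>

   template< typename RealType >
   void setDataView( MPI_File file, MPI_Offset offset, MPI_Datatype ftype )
   {
      // Was: Communicators::MPITypeResolver< RealType >::getType()
      MPI_File_set_view( file, offset,
                         TNL::MPI::getDataType< RealType >(),
                         ftype, "native", MPI_INFO_NULL );
   }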
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
index 5a11502403..7bc17f9204 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
@@ -16,7 +16,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h>
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Pointers/SharedPointer.h>
 
 namespace TNL {
diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h
index 9182c620fe..5c35c7c338 100644
--- a/src/TNL/Solvers/Solver_impl.h
+++ b/src/TNL/Solvers/Solver_impl.h
@@ -16,11 +16,11 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 namespace TNL {
 namespace Solvers {
-   
+
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename MeshConfig, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
           template< typename MeshConfig > class ProblemConfig,
           typename MeshConfig >
@@ -37,7 +37,7 @@ run( int argc, char* argv[] )
    Devices::Cuda::configSetup( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv );
+   TNL::MPI::ScopedInitializer mpi( argc, argv );
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return false;
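
Every former Communicators::ScopedInitializer< Communicator > call site in this patch reduces to the same idiom. A minimal sketch (illustrative; the constructor and destructor are assumed to wrap MPI initialization and finalization, which these hunks do not show):

   #include <TNL/MPI/ScopedInitializer.h>

   int main( int argc, char* argv[] )
   {
      // No communicator template parameter anymore -- the guard is a plain class.
      TNL::MPI::ScopedInitializer mpi( argc, argv );
      // ... parse options and run the solver ...
      return 0;   // MPI is presumably finalized when `mpi` goes out of scope
   }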
diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp
index c33ae82943..a2d4f48e9a 100644
--- a/src/Tools/tnl-game-of-life.cpp
+++ b/src/Tools/tnl-game-of-life.cpp
@@ -18,7 +18,7 @@
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 using namespace TNL;
 
@@ -361,7 +361,7 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index 1a7769b5c8..73765aafb9 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -16,7 +16,7 @@
 #include <TNL/Meshes/Grid.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 
 using namespace TNL;
@@ -55,7 +55,7 @@ int main( int argc, char* argv[] )
    setupConfig( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h
index 0be53242b8..1b8c59c75f 100644
--- a/src/Tools/tnl-test-distributed-mesh.h
+++ b/src/Tools/tnl-test-distributed-mesh.h
@@ -19,7 +19,7 @@
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 
 using namespace TNL;
 
@@ -431,7 +431,7 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
index 113d1daa3e..366535cc74 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
index 145b0db5b4..aba9420f04 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index d80e467f5f..3c637de4d2 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
index a072b2e806..93d6c3036b 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
@@ -10,7 +10,6 @@
 #include <gtest/gtest.h>
 
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index 7decaf5752..b778937b69 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -18,7 +18,6 @@
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Meshes/Readers/PVTUReader.h>
diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h
index 0f8f4b059a..4c89b60ba4 100644
--- a/src/UnitTests/main_mpi.h
+++ b/src/UnitTests/main_mpi.h
@@ -7,7 +7,7 @@
 
 #if (defined(HAVE_GTEST) && defined(HAVE_MPI))
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
 using CommunicatorType = TNL::Communicators::MpiCommunicator;
 
 #include <sstream>
@@ -58,7 +58,7 @@ int main( int argc, char* argv[] )
       delete listeners.Release(listeners.default_result_printer());
       listeners.Append(new MinimalistBufferedPrinter);
 
-      TNL::Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
+      TNL::MPI::ScopedInitializer mpi(argc, argv);
    #endif
    return RUN_ALL_TESTS();
 #else
-- 
GitLab