diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05a0fd0b6849f69b166dfcb400ff705df68ee61f..b85842c1f14a9eca598427b6b7240d2f1cbba417 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,7 @@ if( ${WITH_CUDA} )
                set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} )
             endif()
         endif()
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda --default-stream per-thread)
         # disable false compiler warnings
         #   reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937
         #   incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg
diff --git a/Documentation/Pages/main-page.md b/Documentation/Pages/main-page.md
index db9aceccbbf7e887db7b76c8145204b8380f132a..5693f92a0df917185a705df474389bb554f08aa4 100644
--- a/Documentation/Pages/main-page.md
+++ b/Documentation/Pages/main-page.md
@@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries.
     - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for
       computations on Nvidia GPUs.
     - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can
-      use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for
-      distributed computing. For distributed CUDA computations, the library must
-      be [CUDA-aware](
+      use a library implementing the MPI-3 standard for distributed computing (e.g.
+      [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations,
+      the library must be [CUDA-aware](
       https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).
 
 - __Libraries:__
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index 56a51cc2234f964866bdb7ae3d2f07e03851ebea..55b92ad81a27a198dcc6cb71d534b84664ed3d78 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -2,11 +2,10 @@
 
 ## Tutorials
 
-1. [Building applications with TNL](tutorial_building_applications_with_tnl.html)
-2. [General concepts](tutorial_GeneralConcepts.html)
-3. [Arrays](tutorial_Arrays.html)
-4. [Vectors](tutorial_Vectors.html)
-5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
-6. [For loops](tutorial_ForLoops.html)
-7. [Cross-device pointers](tutorial_Pointers.html)
-8. [Matrices](tutorial_Matrices.html)
+1. [General concepts](tutorial_GeneralConcepts.html)
+2. [Arrays](tutorial_Arrays.html)
+3. [Vectors](tutorial_Vectors.html)
+4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
+5. [For loops](tutorial_ForLoops.html)
+6. [Cross-device pointers](tutorial_Pointers.html)
+7. [Matrices](tutorial_Matrices.html)
diff --git a/src/3rdparty/CMakeLists.txt b/src/3rdparty/CMakeLists.txt
index 6dba288f0332c40c2e6eaf068e46af089519492e..01550de19eb337db7856628463ef7233763c4729 100644
--- a/src/3rdparty/CMakeLists.txt
+++ b/src/3rdparty/CMakeLists.txt
@@ -1,3 +1,9 @@
 install( DIRECTORY mpark Leksys TYPE INCLUDE
          MESSAGE_NEVER
          FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" )
+
+if( ${WITH_PYTHON} )
+   install( DIRECTORY cctbx TYPE INCLUDE
+            MESSAGE_NEVER
+            FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" )
+endif()
diff --git a/src/3rdparty/async/README.md b/src/3rdparty/async/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..36106864ab9f712faf762188a58104947ac5e4f5
--- /dev/null
+++ b/src/3rdparty/async/README.md
@@ -0,0 +1,532 @@
+# async
+Homepage: https://github.com/d36u9/async
+
+[[License(Boost Software License - Version 1.0)](http://www.boost.org/LICENSE_1_0.txt)]
+
+## Welcome
+async is a tiny C++ header-only high-performance library for async calls handled by a thread-pool, which is built on top of an unbounded MPMC lock-free queue.
+It's written in pure C++14 (C++11 support with preprocessor macros), no dependencies on other 3rd party libraries.
+
+Note: This library is originally designed for 64bit system. It has been tested on arch X86-64 and ARMV8(64bit), and ARMV7(32bit).
+
+## change logs
+* Jun. 2018:
+  * Added support for ARMV7 & V8
+  * Tested on Raspberry Pi 3 B+ with Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64)
+  * Tested on Raspberry Pi 3 B+ with Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l)
+  * Added Benchmark Results for Raspberry Pi 3 B+ ARMV8 (Linux Pi64 4.14.44-V8 AArch64)
+  * Added Benchmark Results for Raspberry Pi 3 B+ ARMV7 32bit (Linux 4.14.34-v7 armv7l)
+* Sept. 2017:
+  * Significantly improved the performance of async::queue without bulk operations.
+  * async::threadpool also benefits from this change.
+  * A bounded MPMC queue `async::bounded_queue` was added to the lib, which is pretty useful for memory-constrained systems or some fixed-size message pipeline designs. The overall performance of this buffer-based `async::bounded_queue` is comparable to bulk operations of node-based `async::queue`. `async::bounded_queue` shares an almost identical interface with `async::queue`, except for bulk operations; a size parameter has to be passed to `bounded_queue`'s constructor, and blocking methods (`blocking_enqueue` & `blocking_dequeue`) were also added. The `TRAIT::NOEXCEPT_CHECK` setting is also similar to `async::queue` to help handle exceptions that may be thrown in the element's ctor.  `bounded_queue` is basically a C++ implementation of the [PTLQueue](https://blogs.oracle.com/dave/ptlqueue-:-a-scalable-bounded-capacity-mpmc-queue) design (please read Dave Dice's article for details and references).
+
+## Features
+* interchangeable with std::async, accepts all kinds of callable instances, like static functions, member functions, functors, lambdas
+* dynamically changeable thread-pool size at run-time
+* tasks are managed in a lock-free queue
+* provided lock-free queue doesn't have restricted limitation as boost::lockfree::queue
+* low-latency for the task execution thanks to underlying lock-free queue
+
+## Tested Platforms& Compilers
+(old versions of OSs or compilers may work, but not tested)
+* Windows 10 Visual Studio 2015+
+* Linux Ubuntu 16.04 gcc4.9.2+/clang 3.8+
+* MacOS Sierra 10.12.5 clang-802.0.42
+
+## Getting Started
+## Building the test& benchmark
+
+### C++11 compilers
+If your compiler only supports C++11, please edit CMakeLists.txt with the following change:
+```
+set(CMAKE_CXX_STANDARD 14)
+#change to
+set(CMAKE_CXX_STANDARD 11)
+```
+
+### Build& test with Microsoft C++ REST SDK
+If your OS is Windows or has cpprestsdk installed & configured on Linux or Mac, please edit CMakeLists.txt to enable PPL test:
+```
+option(WITH_CPPRESTSDK "Build Cpprestsdk Test" OFF)
+#to
+option(WITH_CPPRESTSDK "Build Cpprestsdk Test" ON)
+```
+
+
+### Build for Linux or Mac (x86-64 & ARMV7&V8)
+```
+#to use clang (linux) with following export command
+#EXPORT CC=clang-3.8
+#EXPORT CXX=clang++-3.8
+#run the following to set up release build, (for macOS Xcode, you can remove -DCMAKE_BUILD_TYPE for now, and choose build type at build-time)
+cmake -H. -Bbuild -DCMAKE_BUILD_TYPE=RELEASE
+#now build the release
+cmake --build build --config Release
+#or debug
+cmake --build build --config Debug
+#or other builds
+cmake --build build --config RelWithDebInfo
+cmake --build build --config MinSizeRel
+```
+
+### Build for Windows (X86-64)
+```
+#for VS 2015
+cmake -H. -Bbuild -G "Visual Studio 14 2015 Win64"
+#or VS 2017
+cmake -H. -Bbuild -G "Visual Studio 15 2017 Win64"
+#build the release from command line or you can open the project file in Visual Studio, and build from there
+cmake --build build --config Release
+```
+
+## How to use it in your project/application
+simply copy all headers in async sub-folder to your project, and include those headers in your source code.
+
+## Thread Pool Introduction
+### Thread Pool initializations
+
+```
+async::threadpool tp; //by default, thread pool size will be the same number of your hardware CPU core/threads
+async::threadpool tp(8); //create a thread pool with 8 threads
+async::threadpool tp(0); //create a thread pool with no threads available, it's in pause mode
+```
+
+### resize the thread pool
+```
+async::threadpool tp(32);
+...//some operations
+tp.configurepool(16);// can be called at anytime (as long as tp is still valid) to reset the pool size
+                     // no interruption for running tasks
+```
+### submit the task
+* static functions, member functions, functors, and lambdas are all supported
+```
+int foo(int i) { return ++i; }
+auto pkg = tp.post(foo, i); //returns a std::future
+pkg.get(); //will block
+```
+
+## multi-producer multi-consumer unbounded lock-free queue Introduction
+The design: A simple and classic implementation. It's link-based 3-level depth nested container with local array for each level storage and simulated tagged pointer for linking.
+The size of each level, and tag bits can be configured through TRAITS (please see source for details).
+The queue with default traits settings can store up to 1 Trillion elements/nodes (at least 1 Terabyte memory space).
+
+### element type requirements
+* nothrow destructible
+* optional (better to be true)
+  * nothrow constructible
+  * nothrow move-assignable
+
+NOTE: the exception thrown by constructor is acceptable. Although it'd be better to keep ctor noexcept if possible.
+noexcept detection is turned off by default, it can be turned on by setting  `TRAIT::NOEXCEPT_CHECK` to true.
+With `TRAIT::NOEXCEPT_CHECK` on(true), queue will enable exception handling if ctor or move assignment may throw exceptions.
+
+
+### queue initializations
+```
+async::queue<T> q; //default constructor, it's unbounded
+
+async::queue<T> q(1000); // pre-allocated 1000 storage nodes, the capacity will increase automatically after 1000 nodes are used
+```
+### usage
+```
+// enqueues a T constructed from args, supports the following constructions:
+// move, if args is a T rvalue
+// copy, if args is a T lvalue, or
+// emplacement if args is an initializer list that can be passed to a T constructor
+async::queue<T>::enqueue(Args... args)
+
+async::queue<T>::dequeue(T& data) //type T should have move assignment operator,
+//e.g.
+async::queue<int> q;
+q.enqueue(11);
+int i(0);
+q.dequeue(i);
+
+```
+### bulk operations
+It's convenient for bulk data, and can also boost the throughput.
+exception handling is not available in bulk operations even with `TRAIT::NOEXCEPT_CHECK` being true.
+bulk operations are suitable for plain data types, like network/event messages.
+
+```
+int a[] = {1,2,3,4,5};
+int b[5];
+q.bulk_enqueue(std::begin(a), 5);
+auto popcount = q.bulk_dequeue(std::begin(b), 5); //popcount is the number of elements successfully pulled from the queue.
+//or like the following code:
+std::vector<int> v;
+auto it = std::inserter(v, std::begin(v));
+popcount = q.bulk_dequeue(it, 5);
+```
+
+## Unit Test
+The unit test code provides most samples for usage.
+
+## Benchmark
+NOTE: the results may vary on different OS platforms and hardware.
+### thread pool benchmark
+The benchmark is a simple demonstration.
+NOTE: may require extra config, please see CMakeLists.txt for detailed settings
+The test benchmarks the following task/job based async implementations:
+* async::threadpool (this library)
+* std::async
+* boost::async
+* AsioThreadPool (my another implementation based on boost::asio, has very stable and good performance, especially on Windows with iocp)
+* Microsoft::PPL (pplx from [cpprestsdk](https://github.com/Microsoft/cpprestsdk) on Linux& MacOS or PPL on windows)
+
+
+e.g. Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64)
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 1130 ns  max: 1227 ns  min: 1066 ns avg_task_post: 1032 ns
+       *std::async (time/task) avg: 1469 ns  max: 1549 ns  min: 1423 ns avg_task_post: 1250 ns
+   *Microsoft::PPL (time/task) avg: 1148 ns  max: 1216 ns  min: 1114 ns avg_task_post: 1088 ns
+    AsioThreadPool (time/task) avg: 1166 ns  max: 1319 ns  min: 1013 ns avg_task_post: 1073 ns
+     *boost::async (time/task) avg: 29153 ns  max: 30028 ns  min: 27990 ns avg_task_post: 23343 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 439 ns  max: 557 ns  min: 398 ns avg_task_post: 356 ns
+       *std::async (time/task) avg: 800 ns  max: 890 ns  min: 759 ns avg_task_post: 629 ns
+   *Microsoft::PPL (time/task) avg: 666 ns  max: 701 ns  min: 640 ns avg_task_post: 605 ns
+    AsioThreadPool (time/task) avg: 448 ns  max: 541 ns  min: 389 ns avg_task_post: 365 ns
+     *boost::async (time/task) avg: 32419 ns  max: 33296 ns  min: 30105 ns avg_task_post: 25561 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 262 ns  max: 300 ns  min: 252 ns avg_task_post: 176 ns
+       *std::async (time/task) avg: 873 ns  max: 961 ns  min: 821 ns avg_task_post: 701 ns
+   *Microsoft::PPL (time/task) avg: 727 ns  max: 755 ns  min: 637 ns avg_task_post: 662 ns
+    AsioThreadPool (time/task) avg: 607 ns  max: 645 ns  min: 567 ns avg_task_post: 210 ns
+     *boost::async (time/task) avg: 33158 ns  max: 150331 ns  min: 28560 ns avg_task_post: 28655 ns
+```
+
+e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 1320 ns  max: 1357 ns  min: 1301 ns avg_task_post: 1266 ns
+       *std::async (time/task) avg: 11817 ns  max: 12469 ns  min: 11533 ns avg_task_post: 9580 ns
+   *Microsoft::PPL (time/task) avg: 1368 ns  max: 1498 ns  min: 1325 ns avg_task_post: 1349 ns
+    AsioThreadPool (time/task) avg: 1475 ns  max: 1499 ns  min: 1318 ns avg_task_post: 1332 ns
+     *boost::async (time/task) avg: 4574 ns  max: 4697 ns  min: 4450 ns avg_task_post: 4531 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 516 ns  max: 688 ns  min: 239 ns avg_task_post: 522 ns
+       *std::async (time/task) avg: 41630 ns  max: 44316 ns  min: 41334 ns avg_task_post: 38151 ns
+   *Microsoft::PPL (time/task) avg: 3652 ns  max: 3710 ns  min: 3598 ns avg_task_post: 3629 ns
+    AsioThreadPool (time/task) avg: 529 ns  max: 814 ns  min: 494 ns avg_task_post: 447 ns
+     *boost::async (time/task) avg: 14634 ns  max: 14669 ns  min: 14598 ns avg_task_post: 14583 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 398 ns  max: 468 ns  min: 337 ns avg_task_post: 177 ns
+       *std::async (time/task) avg: 44603 ns  max: 46904 ns  min: 44272 ns avg_task_post: 40877 ns
+   *Microsoft::PPL (time/task) avg: 3714 ns  max: 3816 ns  min: 3656 ns avg_task_post: 3690 ns
+    AsioThreadPool (time/task) avg: 564 ns  max: 605 ns  min: 533 ns avg_task_post: 253 ns
+     *boost::async (time/task) avg: 20421 ns  max: 21738 ns  min: 19105 ns avg_task_post: 20375 ns
+```
+
+e.g. MacOS 10.12.5 clang Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 (Microsoft::PPL(cpprestsdk::pplx) is surprisingly good compared with other libraries on MacOS, not sure if it's due to some compiler optimization)
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 8517 ns  max: 8641 ns  min: 7400 ns avg_task_post: 8393 ns
+       *std::async (time/task) avg: 13618 ns  max: 13845 ns  min: 13276 ns avg_task_post: 13476 ns
+   *Microsoft::PPL (time/task) avg: 747 ns  max: 938 ns  min: 626 ns avg_task_post: 718 ns
+    AsioThreadPool (time/task) avg: 8647 ns  max: 8807 ns  min: 8558 ns avg_task_post: 8524 ns
+     *boost::async (time/task) avg: 11732 ns  max: 12028 ns  min: 11526 ns avg_task_post: 11698 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 5964 ns  max: 6017 ns  min: 5790 ns avg_task_post: 5830 ns
+       *std::async (time/task) avg: 9690 ns  max: 10043 ns  min: 9132 ns avg_task_post: 9531 ns
+   *Microsoft::PPL (time/task) avg: 380 ns  max: 425 ns  min: 342 ns avg_task_post: 353 ns
+    AsioThreadPool (time/task) avg: 6173 ns  max: 6459 ns  min: 6116 ns avg_task_post: 6042 ns
+     *boost::async (time/task) avg: 8643 ns  max: 9470 ns  min: 8513 ns avg_task_post: 8591 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 3469 ns  max: 3527 ns  min: 3415 ns avg_task_post: 3339 ns
+       *std::async (time/task) avg: 10902 ns  max: 11164 ns  min: 10709 ns avg_task_post: 10738 ns
+   *Microsoft::PPL (time/task) avg: 367 ns  max: 426 ns  min: 326 ns avg_task_post: 323 ns
+    AsioThreadPool (time/task) avg: 3920 ns  max: 3975 ns  min: 3832 ns avg_task_post: 3409 ns
+     *boost::async (time/task) avg: 9800 ns  max: 10223 ns  min: 9196 ns avg_task_post: 9744 ns
+```
+
+e.g. Windows 7 64bit Intel i7-4790 16GB RAM Visual Studio 2015 Update 3
+```
+Benchmark Test Run: 1 Producers 7(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 809 ns  max: 924 ns  min: 687 ns avg_task_post: 774 ns
+       *std::async (time/task) avg: 1914 ns  max: 2032 ns  min: 1790 ns avg_task_post: 1877 ns
+   *Microsoft::PPL (time/task) avg: 1718 ns  max: 2181 ns  min: 1623 ns avg_task_post: 1677 ns
+    AsioThreadPool (time/task) avg: 1100 ns  max: 1137 ns  min: 1076 ns avg_task_post: 1065 ns
+     *boost::async (time/task) avg: 191532 ns  max: 203716 ns  min: 186114 ns avg_task_post: 191507 ns
+...
+Benchmark Test Run: 4 Producers 4(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 423 ns  max: 538 ns  min: 338 ns avg_task_post: 388 ns
+       *std::async (time/task) avg: 1249 ns  max: 1279 ns  min: 1233 ns avg_task_post: 1211 ns
+   *Microsoft::PPL (time/task) avg: 1229 ns  max: 1246 ns  min: 1208 ns avg_task_post: 1186 ns
+    AsioThreadPool (time/task) avg: 563 ns  max: 577 ns  min: 499 ns avg_task_post: 528 ns
+     *boost::async (time/task) avg: 95484 ns  max: 112569 ns  min: 93808 ns avg_task_post: 95458 ns
+...
+Benchmark Test Run: 7 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 267 ns  max: 323 ns  min: 255 ns avg_task_post: 232 ns
+       *std::async (time/task) avg: 1202 ns  max: 1257 ns  min: 1182 ns avg_task_post: 1009 ns
+   *Microsoft::PPL (time/task) avg: 1199 ns  max: 1262 ns  min: 1175 ns avg_task_post: 988 ns
+    AsioThreadPool (time/task) avg: 783 ns  max: 960 ns  min: 706 ns avg_task_post: 375 ns
+     *boost::async (time/task) avg: 103572 ns  max: 107041 ns  min: 101993 ns avg_task_post: 103542 ns
+```
+
+e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+
+```
+Benchmark Test Run: 1 Producers 3(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 7809 ns  max: 10467 ns  min: 7453 ns avg_task_post: 7261 ns
+       *std::async (time/task) avg: 139664 ns  max: 3453077 ns  min: 104589 ns avg_task_post: 117819 ns
+    AsioThreadPool (time/task) avg: 6545 ns  max: 8804 ns  min: 5678 ns avg_task_post: 5654 ns
+     *boost::async (time/task) avg: 37629 ns  max: 38978 ns  min: 36769 ns avg_task_post: 36933 ns
+
+Benchmark Test Run: 2 Producers 2(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 2207 ns  max: 4084 ns  min: 1809 ns avg_task_post: 1325 ns
+       *std::async (time/task) avg: 431781 ns  max: 17500817 ns  min: 91919 ns avg_task_post: 407595 ns
+    AsioThreadPool (time/task) avg: 2251 ns  max: 3351 ns  min: 1839 ns avg_task_post: 1405 ns
+     *boost::async (time/task) avg: 48456 ns  max: 50578 ns  min: 46698 ns avg_task_post: 47753 ns
+
+Benchmark Test Run: 3 Producers 1(* not applied) Consumers  with 21000 tasks and run 100 batches
+  async::threapool (time/task) avg: 3346 ns  max: 3974 ns  min: 2635 ns avg_task_post: 1017 ns
+       *std::async (time/task) avg: 110853 ns  max: 768224 ns  min: 103045 ns avg_task_post: 86361 ns
+    AsioThreadPool (time/task) avg: 3828 ns  max: 4209 ns  min: 3354 ns avg_task_post: 976 ns
+     *boost::async (time/task) avg: 59094 ns  max: 67042 ns  min: 54802 ns avg_task_post: 58365 ns
+```
+
+### queue benchmark
+The benchmark uses producers-consumers model, and doesn't provide all the detailed measurements.
+* async::bounded_queue
+* async::queue
+* boost::lockfree::queue
+* boost::lockfree::spsc_queue  (only for single-producer-single-consumer test)
+
+e.g. Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64)
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 18 ns  max: 55 ns  min: 17 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 50 ns  min: 23 ns
+          async::queue (time/op) avg: 28 ns  max: 66 ns  min: 27 ns
+boost::lockfree::queue (time/op) avg: 167 ns  max: 195 ns  min: 70 ns
+boost::lockfree::spsc_queue (time/op) avg: 10 ns  max: 38 ns  min: 8 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 27 ns  max: 62 ns  min: 25 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 124 ns  min: 24 ns
+          async::queue (time/op) avg: 42 ns  max: 115 ns  min: 29 ns
+boost::lockfree::queue (time/op) avg: 240 ns  max: 576 ns  min: 119 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 44 ns  max: 78 ns  min: 29 ns
+async::queue::bulk(16) (time/op) avg: 34 ns  max: 109 ns  min: 28 ns
+          async::queue (time/op) avg: 90 ns  max: 122 ns  min: 44 ns
+boost::lockfree::queue (time/op) avg: 213 ns  max: 227 ns  min: 161 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 53 ns  max: 82 ns  min: 27 ns
+async::queue::bulk(16) (time/op) avg: 34 ns  max: 107 ns  min: 29 ns
+          async::queue (time/op) avg: 100 ns  max: 114 ns  min: 51 ns
+boost::lockfree::queue (time/op) avg: 197 ns  max: 207 ns  min: 186 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 31 ns  max: 81 ns  min: 25 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 104 ns  min: 28 ns
+          async::queue (time/op) avg: 93 ns  max: 117 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 211 ns  max: 222 ns  min: 162 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 52 ns  max: 79 ns  min: 30 ns
+async::queue::bulk(16) (time/op) avg: 33 ns  max: 103 ns  min: 29 ns
+          async::queue (time/op) avg: 94 ns  max: 126 ns  min: 74 ns
+boost::lockfree::queue (time/op) avg: 199 ns  max: 217 ns  min: 174 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 49 ns  max: 81 ns  min: 35 ns
+async::queue::bulk(16) (time/op) avg: 33 ns  max: 60 ns  min: 28 ns
+          async::queue (time/op) avg: 97 ns  max: 134 ns  min: 51 ns
+boost::lockfree::queue (time/op) avg: 185 ns  max: 198 ns  min: 152 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 36 ns  max: 81 ns  min: 34 ns
+async::queue::bulk(16) (time/op) avg: 30 ns  max: 60 ns  min: 26 ns
+          async::queue (time/op) avg: 48 ns  max: 89 ns  min: 45 ns
+boost::lockfree::queue (time/op) avg: 161 ns  max: 179 ns  min: 120 ns
+```
+
+e.g. MacOS 10.12.5 Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 12 ns  max: 37 ns  min: 12 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 54 ns  min: 25 ns
+          async::queue (time/op) avg: 23 ns  max: 61 ns  min: 23 ns
+boost::lockfree::queue (time/op) avg: 156 ns  max: 172 ns  min: 118 ns
+boost::lockfree::spsc_queue (time/op) avg: 11 ns  max: 30 ns  min: 5 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 84 ns  max: 98 ns  min: 60 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 125 ns  min: 24 ns
+          async::queue (time/op) avg: 104 ns  max: 115 ns  min: 92 ns
+boost::lockfree::queue (time/op) avg: 231 ns  max: 326 ns  min: 213 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 82 ns  max: 100 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 36 ns  max: 108 ns  min: 31 ns
+          async::queue (time/op) avg: 102 ns  max: 122 ns  min: 90 ns
+boost::lockfree::queue (time/op) avg: 192 ns  max: 229 ns  min: 184 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 79 ns  max: 93 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 94 ns  min: 29 ns
+          async::queue (time/op) avg: 98 ns  max: 116 ns  min: 70 ns
+boost::lockfree::queue (time/op) avg: 189 ns  max: 198 ns  min: 175 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 77 ns  max: 146 ns  min: 56 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 92 ns  min: 26 ns
+          async::queue (time/op) avg: 93 ns  max: 167 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 200 ns  max: 218 ns  min: 182 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 76 ns  max: 92 ns  min: 48 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 89 ns  min: 24 ns
+          async::queue (time/op) avg: 97 ns  max: 140 ns  min: 83 ns
+boost::lockfree::queue (time/op) avg: 200 ns  max: 211 ns  min: 163 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 80 ns  max: 98 ns  min: 59 ns
+async::queue::bulk(16) (time/op) avg: 28 ns  max: 97 ns  min: 24 ns
+          async::queue (time/op) avg: 105 ns  max: 122 ns  min: 78 ns
+boost::lockfree::queue (time/op) avg: 182 ns  max: 194 ns  min: 153 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 86 ns  max: 103 ns  min: 64 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 82 ns  min: 23 ns
+          async::queue (time/op) avg: 107 ns  max: 127 ns  min: 91 ns
+boost::lockfree::queue (time/op) avg: 154 ns  max: 180 ns  min: 146 ns
+```
+
+e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 12 ns  max: 71 ns  min: 11 ns
+async::queue::bulk(16) (time/op) avg: 65 ns  max: 134 ns  min: 24 ns
+          async::queue (time/op) avg: 48 ns  max: 107 ns  min: 33 ns
+boost::lockfree::queue (time/op) avg: 179 ns  max: 198 ns  min: 60 ns
+boost::lockfree::spsc_queue (time/op) avg: 7 ns  max: 47 ns  min: 4 ns
+
+Benchmark Test Run: 1 Producers 7 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 68 ns  max: 505 ns  min: 35 ns
+async::queue::bulk(16) (time/op) avg: 29 ns  max: 135 ns  min: 25 ns
+          async::queue (time/op) avg: 93 ns  max: 138 ns  min: 73 ns
+boost::lockfree::queue (time/op) avg: 234 ns  max: 292 ns  min: 208 ns
+
+Benchmark Test Run: 2 Producers 6 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 68 ns  max: 106 ns  min: 39 ns
+async::queue::bulk(16) (time/op) avg: 35 ns  max: 117 ns  min: 19 ns
+          async::queue (time/op) avg: 92 ns  max: 135 ns  min: 79 ns
+boost::lockfree::queue (time/op) avg: 193 ns  max: 227 ns  min: 175 ns
+
+Benchmark Test Run: 3 Producers 5 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 73 ns  max: 251 ns  min: 49 ns
+async::queue::bulk(16) (time/op) avg: 31 ns  max: 110 ns  min: 26 ns
+          async::queue (time/op) avg: 96 ns  max: 178 ns  min: 70 ns
+boost::lockfree::queue (time/op) avg: 179 ns  max: 359 ns  min: 164 ns
+
+Benchmark Test Run: 4 Producers 4 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 81 ns  max: 220 ns  min: 61 ns
+async::queue::bulk(16) (time/op) avg: 27 ns  max: 114 ns  min: 23 ns
+          async::queue (time/op) avg: 102 ns  max: 159 ns  min: 74 ns
+boost::lockfree::queue (time/op) avg: 177 ns  max: 541 ns  min: 162 ns
+
+Benchmark Test Run: 5 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 83 ns  max: 443 ns  min: 53 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 297 ns  min: 23 ns
+          async::queue (time/op) avg: 110 ns  max: 512 ns  min: 79 ns
+boost::lockfree::queue (time/op) avg: 176 ns  max: 505 ns  min: 161 ns
+
+Benchmark Test Run: 6 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 83 ns  max: 437 ns  min: 36 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 261 ns  min: 23 ns
+          async::queue (time/op) avg: 112 ns  max: 449 ns  min: 84 ns
+boost::lockfree::queue (time/op) avg: 178 ns  max: 547 ns  min: 164 ns
+
+Benchmark Test Run: 7 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 90 ns  max: 805 ns  min: 28 ns
+async::queue::bulk(16) (time/op) avg: 26 ns  max: 78 ns  min: 21 ns
+          async::queue (time/op) avg: 123 ns  max: 695 ns  min: 80 ns
+boost::lockfree::queue (time/op) avg: 195 ns  max: 615 ns  min: 154 ns
+```
+
+e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 67 ns  max: 697 ns  min: 53 ns
+async::queue::bulk(16) (time/op) avg: 144 ns  max: 434 ns  min: 130 ns
+          async::queue (time/op) avg: 141 ns  max: 441 ns  min: 115 ns
+boost::lockfree::queue (time/op) avg: 182 ns  max: 514 ns  min: 168 ns
+boost::lockfree::spsc_queue (time/op) avg: 62 ns  max: 430 ns  min: 53 ns
+
+Benchmark Test Run: 1 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 72 ns  max: 574 ns  min: 59 ns
+async::queue::bulk(16) (time/op) avg: 141 ns  max: 515 ns  min: 116 ns
+          async::queue (time/op) avg: 181 ns  max: 590 ns  min: 134 ns
+boost::lockfree::queue (time/op) avg: 192 ns  max: 1045 ns  min: 172 ns
+
+Benchmark Test Run: 2 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 82 ns  max: 457 ns  min: 65 ns
+async::queue::bulk(16) (time/op) avg: 99 ns  max: 701 ns  min: 84 ns
+          async::queue (time/op) avg: 124 ns  max: 550 ns  min: 108 ns
+boost::lockfree::queue (time/op) avg: 151 ns  max: 847 ns  min: 138 ns
+
+Benchmark Test Run: 3 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 88 ns  max: 538 ns  min: 67 ns
+async::queue::bulk(16) (time/op) avg: 89 ns  max: 717 ns  min: 71 ns
+          async::queue (time/op) avg: 131 ns  max: 631 ns  min: 118 ns
+boost::lockfree::queue (time/op) avg: 165 ns  max: 644 ns  min: 149 ns
+```
+
+e.g. Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) gcc 6.3.0 on Raspberry Pi 3 B+
+```
+Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches
+Benchmark Test Run: 1 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 227 ns  max: 912 ns  min: 179 ns
+async::queue::bulk(16) (time/op) avg: 442 ns  max: 1236 ns  min: 365 ns
+          async::queue (time/op) avg: 423 ns  max: 1249 ns  min: 364 ns
+boost::lockfree::queue (time/op) avg: 474 ns  max: 1017 ns  min: 410 ns
+boost::lockfree::spsc_queue (time/op) avg: 70 ns  max: 761 ns  min: 48 ns
+
+Benchmark Test Run: 1 Producers 3 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 241 ns  max: 1482 ns  min: 187 ns
+async::queue::bulk(16) (time/op) avg: 470 ns  max: 1259 ns  min: 354 ns
+          async::queue (time/op) avg: 488 ns  max: 1482 ns  min: 375 ns
+boost::lockfree::queue (time/op) avg: 462 ns  max: 1158 ns  min: 427 ns
+
+
+Benchmark Test Run: 2 Producers 2 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 208 ns  max: 348 ns  min: 158 ns
+async::queue::bulk(16) (time/op) avg: 285 ns  max: 543 ns  min: 237 ns
+          async::queue (time/op) avg: 306 ns  max: 761 ns  min: 234 ns
+boost::lockfree::queue (time/op) avg: 334 ns  max: 1481 ns  min: 261 ns
+
+
+Benchmark Test Run: 3 Producers 1 Consumers  with 10000 Ops and run 1000 batches
+  async::bounded_queue (time/op) avg: 241 ns  max: 884 ns  min: 192 ns
+async::queue::bulk(16) (time/op) avg: 210 ns  max: 651 ns  min: 180 ns
+          async::queue (time/op) avg: 439 ns  max: 682 ns  min: 375 ns
+boost::lockfree::queue (time/op) avg: 420 ns  max: 903 ns  min: 320 ns
+```
+
+## coding style
+All code has been formatted by clang-format. It may be easier to read in a text editor, or maybe not :)
+
+## Many Thanks to 3rd party and their developers
+* [Boost](http://www.boost.org/)
+* [Boost CMake](https://github.com/Orphis/boost-cmake) Easy Boost integration in CMake projects!
+* [Catch](https://github.com/philsquared/Catch) A powerful test framework for unit test.
+* [cpprestsdk](https://github.com/Microsoft/cpprestsdk) The C++ REST SDK is a Microsoft project for cloud-based client-server communication in native code using a modern asynchronous C++ API design.
+* [rlutil](https://github.com/tapio/rlutil) provides cross-platform console-mode functions to position and colorize text.
+* [sakaki](https://github.com/sakaki-/gentoo-on-rpi3-64bit) Bootable 64-bit Gentoo image for the Raspberry Pi 3 B / B+, with Linux 4.14
diff --git a/src/3rdparty/async/bounded_queue.h b/src/3rdparty/async/bounded_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..341e5f307498714657d37457042da26bf8f455cd
--- /dev/null
+++ b/src/3rdparty/async/bounded_queue.h
@@ -0,0 +1,342 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "utility.h"
+#include <atomic>
+#include <cassert>
+#include <limits>
+
+namespace async {
+
+struct bounded_traits { // default compile-time configuration for bounded_queue
+  static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag: true enables the SAFE-IMPL overloads for operations that may throw
+  static constexpr std::size_t CachelineSize = 64; // byte size of the padding arrays placed around the queue's index members
+  static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
+  using sequence_type = std::uint64_t; // integer type of the enqueue/dequeue counters and per-slot tickets
+};
+
+template <typename T, typename TRAITS = bounded_traits> class bounded_queue {
+private:
+  static_assert(std::is_nothrow_destructible<T>::value,
+                "T must be nothrow destructible");
+
+public:
+  static constexpr std::size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment;
+  using seq_t = typename TRAITS::sequence_type;
+  explicit bounded_queue(std::size_t size) // size: number of slots to allocate; must be > 0 (asserted below)
+      : fastmodulo((size > 0 && ((size & (size - 1)) == 0))), // true when size is a non-zero power of two
+        bitshift(fastmodulo ? getShiftBitsCount(size) : 0),
+        elements(new element[size]), mask(fastmodulo ? size - 1 : 0),
+        qsize(size), enqueueIx(0), dequeueIx(0) {
+    assert(qsize > 0); // any size <= 0 is illegal (note: checked after the buffer was allocated, debug builds only)
+  }
+  bounded_queue(bounded_queue const &) = delete;
+  bounded_queue(bounded_queue &&) = delete;
+  bounded_queue &operator=(bounded_queue const &) = delete;
+  bounded_queue &operator=(bounded_queue &&) = delete;
+  ~bounded_queue() { delete[] elements; }
+  std::size_t size() const noexcept { return qsize; } // slot count passed to the constructor (capacity), not the current number of stored items
+
+  template <typename... Args, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or T is nothrow-constructible from Args
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline void blocking_enqueue(Args &&... args) noexcept {
+    auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); // reserve a sequence number unconditionally
+    auto &ele = elements[index(enqidx)];
+    auto enq_tkt = ticket(enqidx);
+    while (enq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue; // busy-wait until the slot's ticket equals the ticket of this sequence number
+    ele.construct(std::forward<Args>(args)...);
+    ele.tkt.store(enq_tkt + 1, std::memory_order_release); // ticket + 1 is the value the dequeue side waits for
+  }
+
+  template <typename... Args, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and constructing T from Args may throw
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline bool blocking_enqueue(Args &&... args) noexcept { // returns false when T's constructor threw
+    auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); // reserve a sequence number unconditionally
+    auto &ele = elements[index(enqidx)];
+    auto enq_tkt = ticket(enqidx);
+    while (enq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue; // busy-wait until the slot's ticket equals the ticket of this sequence number
+    if (ele.construct(std::forward<Args>(args)...)) {
+      ele.hasdata.store(true, std::memory_order_release);
+      ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+      return true;
+    } else {
+      ele.hasdata.store(false, std::memory_order_release); // slot is still handed to the consumer, flagged as empty
+      ele.tkt.store(enq_tkt + 1, std::memory_order_release); // the sequence number was already consumed, so the ticket must advance
+      return false;
+    }
+  }
+
+  template <typename... Args, // NON-SAFE: non-blocking enqueue, returns false instead of waiting when the queue is full
+            typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                    std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline bool enqueue(Args &&... args) noexcept {
+    auto enqidx = enqueueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(enqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t enq_tkt = ticket(enqidx);
+      seq_t diff = tkt - enq_tkt; // unsigned subtraction: wraps to a huge value when tkt < enq_tkt
+      if (diff == 0) { // slot carries exactly the ticket expected for this sequence number
+        if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, // try to claim the sequence number; on failure enqidx is refreshed
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed)) {
+          ele.construct(std::forward<Args>(args)...);
+          ele.tkt.store(enq_tkt + 1, std::memory_order_release); // ticket + 1 is the value the dequeue side expects
+          return true;
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is full (slot ticket is behind the expected one)
+      else
+        enqidx = enqueueIx.load(std::memory_order_acquire); // slot ticket is ahead: our index is stale, reload and retry
+    }
+  }
+
+  template <typename... Args, // SAFE-IMPL: non-blocking enqueue for constructors that may throw
+            typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                    !std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline bool enqueue(Args &&... args) noexcept { // false means either "queue full" or "constructor threw"
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    for (;;) {
+      auto &ele = elements[index(enqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t enq_tkt = ticket(enqidx);
+      seq_t diff = tkt - enq_tkt; // unsigned subtraction: wraps to a huge value when tkt < enq_tkt
+      if (diff == 0) { // slot carries exactly the ticket expected for this sequence number
+        if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, // try to claim the sequence number; on failure enqidx is refreshed
+                                              std::memory_order_release,
+                                              std::memory_order_relaxed)) {
+          if (ele.construct(std::forward<Args>(args)...)) {
+            ele.hasdata.store(true, std::memory_order_release);
+            ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+            return true;
+          } else {
+            ele.hasdata.store(false, std::memory_order_release); // slot is published anyway, flagged as holding no value
+            ele.tkt.store(enq_tkt + 1, std::memory_order_release);
+            return false;
+          }
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is full (slot ticket is behind the expected one)
+      else
+        enqidx = enqueueIx.load(std::memory_order_acquire); // slot ticket is ahead: our index is stale, reload and retry
+    }
+  }
+
+  template <typename U = T, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or U is nothrow default-constructible
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<U>::value>::type>
+  inline void blocking_dequeue(U &data) noexcept { // spins until an item is available, then move-assigns it into data
+    auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); // reserve a sequence number unconditionally
+    auto &ele = elements[index(deqidx)];
+    seq_t deq_tkt = ticket(deqidx) + 1; // value the enqueue side stores once the slot has been filled
+    while (deq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue; // busy-wait until a producer has published this slot
+    ele.move(data);
+    ele.tkt.store(deq_tkt + 1, std::memory_order_release); // equals ticket() of the same slot one lap later
+  }
+
+  template <typename U = T, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and U is not nothrow default-constructible
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<U>::value>::type>
+  inline bool blocking_dequeue(U &data) noexcept { // returns false when the claimed slot was flagged as holding no value
+    auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); // reserve a sequence number unconditionally
+    auto &ele = elements[index(deqidx)];
+    seq_t deq_tkt = ticket(deqidx) + 1; // value the enqueue side stores once it is done with the slot
+    while (deq_tkt != ele.tkt.load(std::memory_order_acquire))
+      continue; // busy-wait until a producer has published this slot
+    if (ele.hasdata.load(std::memory_order_acquire)) {
+      ele.move(data);
+      ele.tkt.store(deq_tkt + 1, std::memory_order_release);
+      return true;
+    } else {
+      ele.tkt.store(deq_tkt + 1, std::memory_order_release); // nothing to move; just release the slot for the next lap
+      return false;
+    }
+  }
+
+  template <typename U = T, // NON-SAFE: non-blocking dequeue, returns false instead of waiting when the queue is empty
+            typename std::enable_if<!TRAITS::NOEXCEPT_CHECK ||
+                                        std::is_nothrow_constructible<U>::value,
+                                    int>::type = 0>
+  inline bool dequeue(U &data) {
+
+    auto deqidx = dequeueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(deqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t deq_tkt = ticket(deqidx) + 1; // value the enqueue side stores once the slot has been filled
+      seq_t diff = tkt - deq_tkt; // unsigned subtraction: wraps to a huge value when tkt < deq_tkt
+      if (diff == 0) { // slot has been published for this sequence number
+        if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, // try to claim the sequence number; on failure deqidx is refreshed
+                                              std::memory_order_acq_rel,
+                                              std::memory_order_relaxed)) {
+          ele.move(data);
+          ele.tkt.store(deq_tkt + 1, std::memory_order_release); // equals ticket() of the same slot one lap later
+          return true;
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is empty (slot ticket is behind the expected one)
+      else {
+
+        deqidx = dequeueIx.load(std::memory_order_acquire); // slot ticket is ahead: our index is stale, reload and retry
+      }
+    }
+  }
+
+  template <
+      typename U = T, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and U is not nothrow default-constructible
+      typename std::enable_if<TRAITS::NOEXCEPT_CHECK &&
+                                  !std::is_nothrow_constructible<U>::value,
+                              int>::type = 0>
+  inline bool
+  dequeue(U &data) // false could be queue is empty, or skip an invalid element
+  {
+
+    auto deqidx = dequeueIx.load(std::memory_order_acquire);
+    for (;;) {
+      auto &ele = elements[index(deqidx)];
+      seq_t tkt = ele.tkt.load(std::memory_order_acquire);
+      seq_t deq_tkt = ticket(deqidx) + 1; // value the enqueue side stores once it is done with the slot
+      seq_t diff = tkt - deq_tkt; // unsigned subtraction: wraps to a huge value when tkt < deq_tkt
+      if (diff == 0) { // slot has been published for this sequence number
+        if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, // try to claim the sequence number; on failure deqidx is refreshed
+                                              std::memory_order_acq_rel,
+                                              std::memory_order_relaxed)) {
+          if (ele.hasdata.load(std::memory_order_acquire)) {
+            ele.move(data);
+            ele.tkt.store(deq_tkt + 1, std::memory_order_release);
+            return true;
+          } else {
+            ele.tkt.store(deq_tkt + 1, std::memory_order_release); // slot flagged as holding no value: release it and report false
+            return false;
+          }
+        }
+      } else if (diff >= std::numeric_limits<seq_t>::max() / 2)
+        return false; // queue is empty (slot ticket is behind the expected one)
+      else {
+        deqidx = dequeueIx.load(std::memory_order_acquire); // slot ticket is ahead: our index is stale, reload and retry
+      }
+    }
+  }
+
+private:
+  // Maps a sequence number to its slot position in the element buffer.
+  inline seq_t index(seq_t const seq) {
+    if (!fastmodulo) // general size: reduce only when seq is out of range
+      return seq < qsize ? seq : seq % qsize;
+    return seq & mask; // power-of-two size: mask holds qsize - 1
+  }
+
+  // Ticket an empty slot carries for `seq`: the lap number doubled (always even).
+  inline seq_t ticket(seq_t const seq) {
+    if (!fastmodulo) // general size: lap number is seq / qsize
+      return (seq / static_cast<seq_t>(qsize)) << 1;
+    return (seq >> bitshift) << 1; // power-of-two size: uses the precomputed bitshift
+  }
+  //TODO& Review: replace the following with c++ concepts
+  template <typename U = T, typename Enable = void> struct checkdata {}; // base class of element; specialized below
+
+  template <typename U> // NON-SAFE case: empty base, no extra flag
+  struct checkdata<U, typename std::enable_if<
+                          !TRAITS::NOEXCEPT_CHECK ||
+                          std::is_nothrow_constructible<U>::value>::type> {};
+
+  template <typename U> // SAFE case: carries a flag that starts out false
+  struct checkdata<U, typename std::enable_if<
+                          TRAITS::NOEXCEPT_CHECK &&
+                          !std::is_nothrow_constructible<U>::value>::type> {
+    checkdata() : hasdata(false) {}
+    std::atomic<bool> hasdata; // NOTE(review): element declares its own member of the same name, which hides this one
+  };
+
+  struct element : public checkdata<T> { // one queue slot: ticket + raw storage for a T
+    element() : tkt(0) {} // ticket 0 == ticket() of every sequence number in the first lap
+    ~element() {
+      if (tkt & 1) // enqueue op visited (odd ticket); NOTE(review): a failed SAFE construct also leaves an odd ticket and hasdata is not consulted here
+        destruct();
+    }
+
+    template <typename... Args, // NON-SAFE: placement-new without exception handling
+              typename = typename std::enable_if<
+                  !TRAITS::NOEXCEPT_CHECK ||
+                  std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline void construct(Args &&... args) noexcept {
+      new (&storage) T(std::forward<Args>(args)...);
+    }
+
+    template <typename... Args, // SAFE-IMPL: reports a throwing constructor as `false`
+              typename = typename std::enable_if<
+                  TRAITS::NOEXCEPT_CHECK &&
+                  !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline bool construct(Args &&... args) noexcept {
+      try {
+        new (&storage) T(std::forward<Args>(args)...);
+      } catch (...) {
+        return false; // exception swallowed; storage holds no T
+      }
+      return true;
+    }
+
+    inline void destruct() noexcept { reinterpret_cast<T *>(&storage)->~T(); } // caller must ensure a T lives in storage
+
+    inline T *getptr() { return reinterpret_cast<T *>(&storage); }
+
+    template <
+        typename U = T, // NON-SAFE: move-assign the stored value into data, then destroy it
+        typename std::enable_if<!TRAITS::NOEXCEPT_CHECK ||
+                                    std::is_nothrow_move_assignable<U>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      data = std::move(*getptr());
+      destruct();
+    }
+
+    template <
+        typename U = T, // SAFE-IMPL: same, but an exception from the assignment is swallowed
+        typename std::enable_if<TRAITS::NOEXCEPT_CHECK &&
+                                    !std::is_nothrow_move_assignable<U>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      try {
+        data = std::move(*getptr());
+      } catch (...) {
+      } // the caller is not told that the assignment failed
+      destruct(); // the stored value is destroyed either way
+    }
+
+    std::atomic<seq_t> tkt; // even: slot empty for a lap, odd: slot published by an enqueue
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type storage; // raw memory for the T
+    std::atomic<bool> hasdata; // written/read only by the SAFE-IMPL queue paths; NOTE(review): not initialized by element()
+  };
+
+  bool const fastmodulo;   // true if qsize is power of 2
+  int const bitshift;      // used if fastmodulo is true
+  element *const elements; // pointer to buffer
+  std::size_t const mask;       // used if fastmodulo is true
+  std::size_t const qsize;      // queue size
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<seq_t> enqueueIx;
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<seq_t> dequeueIx;
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size];
+};
+} // namespace async
diff --git a/src/3rdparty/async/queue.h b/src/3rdparty/async/queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b00d1d61fd18ff0278cc43c11ddccb131bef930
--- /dev/null
+++ b/src/3rdparty/async/queue.h
@@ -0,0 +1,429 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+#include "utility.h"
+#include <array>
+#include <atomic>
+#include <memory>
+
+namespace async {
+struct traits // 3-level (L3, L2, L1) depth of nested group design, total
+              // indexing space is pow(2, 64-Tagbits)
+{             // user can change the bits settings by providing your own TRAITS
+  static constexpr std::uint64_t Tagbits = 24; // topmost bits of an index, used as a version tag
+  static constexpr std::uint64_t L3bits = 10; // bits selecting a subgroup in the outermost container
+  static constexpr std::uint64_t L2bits = 10; // bits selecting a subgroup in the middle container
+  static constexpr std::uint64_t L1bits = 12; // bits selecting a base container
+  static constexpr std::uint64_t Basebits = 8; // lowest bits: node position inside a base container (2^Basebits nodes each)
+  static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag: true enables the SAFE-IMPL overloads
+  static constexpr std::size_t CachelineSize = 64; // byte size of the padding arrays between the queue's atomic members
+  static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
+};
+
+template <typename T, typename TRAITS = traits> class queue final {
+public:
+  static bool is_lock_free_v() { // reports whether std::atomic<std::uint64_t> is lock-free on this platform
+    return std::atomic<std::uint64_t>{}.is_lock_free(); // queried on a temporary atomic
+  }
+  static constexpr std::size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment;
+  static constexpr std::uint64_t BaseMask = getBitmask<std::uint64_t>(TRAITS::Basebits);
+  static constexpr std::uint64_t L1Mask = getBitmask<std::uint64_t>(TRAITS::L1bits)
+                                     << TRAITS::Basebits;
+  static constexpr std::uint64_t L2Mask = getBitmask<std::uint64_t>(TRAITS::L2bits)
+                                     << (TRAITS::Basebits + TRAITS::L1bits);
+  static constexpr std::uint64_t L3Mask =
+      getBitmask<std::uint64_t>(TRAITS::L3bits)
+      << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits);
+  static constexpr std::uint64_t TagMask =
+      getBitmask<std::uint64_t>(TRAITS::Tagbits)
+      << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits + TRAITS::L3bits);
+  static constexpr std::uint64_t TagShift = 64 - TRAITS::Tagbits;
+  static constexpr std::uint64_t TagPlus1 = static_cast<std::uint64_t>(1) << TagShift;
+
+public: // assert bits settings meet requirements
+  static_assert(TRAITS::Tagbits + TRAITS::L3bits + TRAITS::L2bits +
+                        TRAITS::L1bits + TRAITS::Basebits ==
+                    64,
+                "The sum of all bits settings should be 64");
+  static_assert(TRAITS::Tagbits > 0 && TRAITS::L3bits > 0 &&
+                    TRAITS::L2bits > 0 && TRAITS::L1bits > 0 &&
+                    TRAITS::Basebits > 3,
+                "All bits settings should be > 0 and Basebits must be > 3");
+  static_assert(std::is_nothrow_destructible<T>::value,
+                "T must be nothrow destructible");
+
+public:
+  queue() : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) { // node 2 starts as the data chain's head/tail, node 1 as the free list's; fresh indices start at 3
+    container.get(index(0)); // allocate initial space
+  }
+  queue(std::size_t size) // pre-allocate storage for about `size` nodes
+      : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) {
+    container.get(index(0)); // allocate the first base group, exactly as the default constructor does
+    // That first group already holds 2^Basebits nodes, so only the excess
+    // beyond it has to be created here.
+    constexpr std::uint64_t preallocated = static_cast<std::uint64_t>(1) << TRAITS::Basebits;
+    index ix;
+    for (std::uint64_t i = preallocated; i < size; ++i) {
+      getNode(ix); // creates the node and stores its index in ix (the returned reference is not needed)
+      recycle(ix); // append it to the free list so that spawn() can hand it out later
+    }
+    // Nothing extra is allocated when size <= preallocated.
+  }
+
+  queue(queue const &other) = delete;
+  queue &operator=(queue const &other) = delete;
+  queue(queue &&other) = delete;
+  queue &operator=(queue &&other) = delete;
+
+  template <typename... Args, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or T is nothrow-constructible from Args
+            typename = typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline void enqueue(Args &&... args) noexcept {
+    auto ix = encapsulate(std::forward<Args>(args)...); // obtain a node and construct the value in it
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, ix, std::memory_order_release, std::memory_order_relaxed))
+      continue; // retry until the new node is installed as tail; enqidx ends up holding the previous tail
+    container[enqidx].next.store(ix, std::memory_order_release); // link the previous tail to the new node
+  }
+
+  template <typename... Args, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and constructing T from Args may throw
+            typename = typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+  inline bool enqueue(Args &&... args) noexcept { // returns false when the value could not be constructed
+    auto ix = encapsulate(std::forward<Args>(args)...);
+    if (ix == 0) // encapsulate returns index 0 when construction failed
+      return false;
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, ix, std::memory_order_release, std::memory_order_relaxed))
+      continue; // retry until the new node is installed as tail; enqidx ends up holding the previous tail
+    container[enqidx].next.store(ix, std::memory_order_release); // link the previous tail to the new node
+    return true;
+  }
+
+  template <typename IT> void bulk_enqueue(IT it, std::size_t count) { // enqueue `count` items read from `it` as one pre-linked batch
+    if (count == 0) return; // empty batch: the CAS below would install index 0 as tail and detach the real one
+    index firstidx(0), preidx(0), lastidx(0);
+    for (std::size_t i = 0; i < count; ++i) {
+      lastidx = encapsulate(*it++);
+      if (firstidx == 0)
+        firstidx = lastidx; // remember the head of the batch
+      if (preidx != 0) // link the previous batch node to this one; the batch is not attached to the queue yet
+        container[preidx].next.store(lastidx, std::memory_order_relaxed);
+      preidx = lastidx;
+    }
+    auto enqidx = enqueueIx.load(std::memory_order_relaxed);
+    while (!enqueueIx.compare_exchange_weak(
+        enqidx, lastidx, std::memory_order_release, std::memory_order_relaxed))
+      continue; // retry until the batch's last node is installed as tail; enqidx ends up holding the previous tail
+    container[enqidx].next.store(firstidx, std::memory_order_release); // attach the whole batch after the previous tail
+  }
+
+  template <typename IT> // IT: iterator whose dereference yields an lvalue that dequeue() can assign into
+  std::size_t bulk_dequeue(IT &&it, std::size_t maxcount) // returns the number of items actually dequeued (<= maxcount)
+  {
+    std::size_t count(0);
+    while (maxcount-- && dequeue(*it++)) { // stops at maxcount items or at the first failed dequeue (it is advanced even then)
+      ++count;
+    }
+    return count;
+  }
+
+  template <typename U> // U could be T, or any kinds of iterators/adapters,
+                        // like insert_iterator
+  inline bool dequeue(U &data) noexcept // return false if queue is empty
+  {
+    for (;;) {
+      auto deqidx = dequeueIx.load(std::memory_order_acquire);
+      auto &node = container[deqidx];
+      auto next = node.next.load(std::memory_order_relaxed);
+      if (next == 0) { // head node has no successor linked yet
+        auto ready_for_consume =
+            node.consume_ready.load(std::memory_order_relaxed);
+        if (!ready_for_consume) {
+          return false; // and it holds no unconsumed value: report empty
+        }
+
+        if (node.consume_ready.compare_exchange_strong( // claim the value; the node itself stays in place as head
+                ready_for_consume, false, std::memory_order_release,
+                std::memory_order_relaxed)) {
+          node.template move<TRAITS>(data);
+          return true;
+        } // CAS lost to another consumer: loop and re-examine the head
+      } else {
+        if (dequeueIx.compare_exchange_weak(deqidx, next, // advance the head past this node; the winner retires the node
+                                            std::memory_order_acq_rel,
+                                            std::memory_order_relaxed)) {
+          auto ready_for_consume =
+              node.consume_ready.load(std::memory_order_acquire);
+          if (ready_for_consume &&
+              node.consume_ready.compare_exchange_strong( // on failure ready_for_consume is overwritten with the current value (false)
+                  ready_for_consume, false, std::memory_order_release,
+                  std::memory_order_relaxed)) {
+            node.template move<TRAITS>(data);
+          } else { // the node is being consumed by another thread, waiting for
+                   // it finishes
+            for (; !node.recycle_ready.load(std::memory_order_acquire);) {
+            }
+          }
+          node.next.store(
+              0, std::memory_order_relaxed); // reset link to avoid chain effect
+          recycle(deqidx); // hand the retired node back to the free list
+          if (ready_for_consume) // true only when this thread moved the value out above
+            return ready_for_consume;
+        } // otherwise the retired node carried no value for us: loop and try the new head
+      }
+    }
+  }
+  std::uint64_t getNodeCount() { return nodeCount.load(); } // current value of the node counter (starts at 3, bumped by getNode); not the number of queued elements
+
+private:       // internal data structures
+  struct index // simulate tagged pointer
+  {
+    index(std::uint64_t newval) noexcept
+        : value(newval) {} // is_trivially_copyable must be true
+    index() noexcept : value(0) {} // 0 is used throughout the queue as the "no node" value
+    inline operator std::uint64_t() const { return value; } // lets an index be compared and masked like a plain integer
+    std::uint64_t getVersion() { return (value & TagMask) >> TagShift; } // tag bits shifted down to a small integer
+    inline void increTag() { // add one to the tag field, leaving the non-tag bits untouched
+      value = (value & ~TagMask) | ((value + TagPlus1) & TagMask);
+    }
+    std::uint64_t value; // tag bits on top, container position bits below
+  };
+
+  struct node // to store the data
+  {
+    node() : next(0), consume_ready(false), recycle_ready(true) {} // starts unlinked, holding no value
+    ~node() noexcept {
+      if (consume_ready.load(std::memory_order_relaxed)) { // a value was constructed and never moved out
+        destruct();
+      }
+    }
+
+    template <typename... Args, // NON-SAFE: placement-new without exception handling
+              typename = typename std::enable_if<
+                  !TRAITS::NOEXCEPT_CHECK ||
+                  std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline void construct(Args &&... args) noexcept {
+      new (&storage) T(std::forward<Args>(args)...);
+      consume_ready.store(true, std::memory_order_release); // value may now be claimed by a consumer
+      recycle_ready.store(false, std::memory_order_release); // cleared until move() has finished with the value
+    }
+
+    template <typename... Args, // SAFE-IMPL: reports a throwing constructor as `false`
+              typename = typename std::enable_if<
+                  TRAITS::NOEXCEPT_CHECK &&
+                  !std::is_nothrow_constructible<T, Args &&...>::value>::type>
+    inline bool construct(Args &&... args) noexcept {
+      try {
+        new (&storage) T(std::forward<Args>(args)...);
+      } catch (...) {
+        return false; // exception swallowed; both flags keep their previous values
+      }
+
+      consume_ready.store(true, std::memory_order_release);
+      recycle_ready.store(false, std::memory_order_release);
+      return true;
+    }
+
+    inline void destruct() noexcept { reinterpret_cast<T *>(&storage)->~T(); } // caller must ensure a T lives in storage
+
+    template <
+        typename TR, typename U, // NON-SAFE: move-assign the value into data, destroy it, then set recycle_ready
+        typename std::enable_if<!TR::NOEXCEPT_CHECK ||
+                                    std::is_nothrow_move_assignable<T>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      data = std::move(*getptr());
+      destruct();
+      recycle_ready.store(true, std::memory_order_release);
+    }
+
+    template <
+        typename TR, typename U, // SAFE-IMPL: same, but an exception from the assignment is swallowed
+        typename std::enable_if<TR::NOEXCEPT_CHECK &&
+                                    !std::is_nothrow_move_assignable<T>::value,
+                                int>::type = 0>
+    inline void move(U &data) {
+      try {
+        data = std::move(*getptr());
+      } catch (...) {
+      } // the caller is not told that the assignment failed
+      destruct(); // the stored value is destroyed either way
+      recycle_ready.store(true, std::memory_order_release);
+    }
+    inline T *getptr() { return reinterpret_cast<T *>(&storage); }
+    std::atomic<index> next;         // link to the following node; 0 means none
+    std::atomic<bool> consume_ready; // if true, consume ready
+    std::atomic<bool> recycle_ready; // if true, recycle ready
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type storage; // data
+  };
+
+  struct basecontainer { // innermost level: a fixed array of 2^Basebits nodes
+    inline node &get(index const &ix) { return operator[](ix); } // same as operator[]; nothing to allocate at this level
+    inline node &at(index const &ix) { return operator[](ix); } // same as operator[]
+    inline node &operator[](index const &ix) { return nodes[ix & BaseMask]; } // only the low Basebits of ix select the node
+    std::array<node, static_cast<std::uint64_t>(1) << TRAITS::Basebits> nodes;
+  };
+
+  template <typename SubGroup, std::uint64_t BitMask> struct nestedcontainer { // one level of the node tree; subgroups are created on demand by get()
+    static constexpr std::uint64_t mask = BitMask; // bits of an index that select a subgroup at this level
+    static constexpr std::uint64_t bits = getSetBitsCount(mask);
+    static constexpr std::uint64_t shift = getShiftBitsCount(mask);
+    std::array<std::atomic<SubGroup *>, static_cast<std::uint64_t>(1) << bits>
+        subgroups; // owning pointers; nullptr until the subgroup is first requested through get()
+    nestedcontainer() {
+      for (auto &gptr : subgroups) {
+        gptr.store(nullptr, std::memory_order_release);
+      }
+    }
+    ~nestedcontainer() { // frees every subgroup that was allocated
+      for (auto &gptr : subgroups) {
+        if (gptr.load(std::memory_order_relaxed) != nullptr)
+          delete gptr.load(std::memory_order_relaxed);
+      }
+    }
+
+    inline node &get(index const &ix) // will trigger the new operation if
+                                      // subgroup doesn't exist
+    {
+      auto ptr =
+          subgroups[(ix & mask) >> shift].load(std::memory_order_acquire);
+      if (ptr == nullptr) {
+        auto newgroup = std::make_unique<SubGroup>(); // if ComExch fails,
+                                                      // unique_ptr will self
+                                                      // delete
+        if (subgroups[(ix & mask) >> shift].compare_exchange_strong(
+                ptr, newgroup.get(), std::memory_order_release,
+                std::memory_order_acquire)) {
+          ptr = newgroup.release(); // installed: ownership passes to the subgroups array
+        } // on failure the CAS has written the already-installed pointer into ptr
+      }
+      return ptr->get(ix); // recursively calling get 'til get the node
+    }
+
+    inline node &operator[](index const &ix) { // unchecked: dereferences the subgroup pointer without a null test
+      return subgroups[(ix & mask) >> shift]
+          .load(std::memory_order_relaxed)
+          ->
+          operator[](ix);
+    }
+
+    inline node &at(index const &ix) { // balanced performance and safety
+      auto ptr =
+          subgroups[(ix & mask) >> shift].load(std::memory_order_relaxed);
+      if (ptr)
+        return ptr->at(ix);
+      else
+        return get(ix); // subgroup missing: fall back to the allocating path
+    }
+  };
+
+  inline node &getNode(index &ix) { // return an existing or new node; ix receives the index taken from nodeCount
+    #if defined(__arm__) && (!defined(__aarch64__))
+    //for ARMV7 or below: increment nodeCount with a CAS loop instead of fetch_add
+    ix.value = nodeCount.load(std::memory_order_relaxed);
+    auto val = ix.value + 1;
+    while(!nodeCount.compare_exchange_weak(
+      ix.value, val, std::memory_order_release, std::memory_order_relaxed)) {
+        val = ix.value + 1; // CAS failed and refreshed ix.value; recompute the successor
+    }
+    #else
+    ix.value = nodeCount.fetch_add(static_cast<std::uint64_t>(1),
+                              std::memory_order_relaxed); // ix gets the counter value from before the increment
+    #endif
+    if ((ix.value & BaseMask) == 0) // low Basebits are all zero: go through get(), which allocates missing subgroups
+      return container.get(ix);
+    else
+      return container.at(ix); // at() also falls back to get() when a subgroup is missing
+  }
+
+  template <typename... Args, // NON-SAFE: chosen when NOEXCEPT_CHECK is off or T is nothrow-constructible from Args
+            typename std::enable_if<
+                !TRAITS::NOEXCEPT_CHECK ||
+                    std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline index encapsulate(Args &&... args) noexcept { // returns the index of a node holding the newly constructed value
+    auto ix = spawn(); // reuse a recycled node or take a fresh one
+    auto &node = container[ix];
+    node.construct(std::forward<Args>(args)...);
+    node.next.store(0, std::memory_order_relaxed); // the node starts with no successor
+    return ix;
+  }
+
+  template <typename... Args, // SAFE-IMPL: chosen when NOEXCEPT_CHECK is on and constructing T from Args may throw
+            typename std::enable_if<
+                TRAITS::NOEXCEPT_CHECK &&
+                    !std::is_nothrow_constructible<T, Args &&...>::value,
+                int>::type = 0>
+  inline index encapsulate(Args &&... args) noexcept { // returns index(0) when the value could not be constructed
+    auto ix = spawn(); // reuse a recycled node or take a fresh one
+    auto &node = container[ix];
+    node.next.store(0, std::memory_order_relaxed); // the node starts with no successor
+    if (node.construct(std::forward<Args>(args)...))
+      return ix;
+    else {
+      recycle(ix); // construction failed, recycle the node
+      return index(0);
+    }
+  }
+
+  inline void recycle(index const &ix) { // make ix the new tail of the free list, then link the old tail to it
+    index prev = recycleIx.load(std::memory_order_relaxed);
+    while (!recycleIx.compare_exchange_weak(prev, ix, std::memory_order_release,
+                                            std::memory_order_relaxed)) {
+    } // a failed CAS refreshes `prev`, so simply retry
+    container[prev].next.store(ix, std::memory_order_release);
+  }
+
+  inline auto spawn() // returns the index of a node to construct into: a recycled one if available, else a fresh one
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER < 1800)
+      -> index
+#endif
+  {
+    index ix(0);
+    for (;;) {
+      auto spaidx = spawnIx.load(std::memory_order_acquire);
+      auto next = container[spaidx].next.load(std::memory_order_relaxed);
+      if (next == 0) { // free-list head has no successor: nothing to reuse
+        getNode(ix); // take a fresh index from nodeCount (allocating storage if needed)
+        return ix;
+      } else {
+        if (spawnIx.compare_exchange_weak(spaidx, next, // pop the free-list head; retry the whole loop on failure
+                                          std::memory_order_acq_rel,
+                                          std::memory_order_relaxed)) {
+          if (spaidx != 0) {
+            spaidx.increTag(); // reused index is returned with its tag field incremented
+          }
+          return spaidx;
+        }
+      }
+    }
+  }
+ 
+  using L1container = nestedcontainer<basecontainer, L1Mask>;
+  using L2container = nestedcontainer<L1container, L2Mask>;
+  nestedcontainer<L2container, L3Mask> container;
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<std::uint64_t> nodeCount; // # of allocated nodes, not the #
+                                                                // of elements stored in the queue
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> dequeueIx;    // dequeue pointer
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> enqueueIx;    // enqueue pointer
+  alignas(cacheline_alignment) char cacheline_padding4[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> spawnIx;      // spawn pointer
+  alignas(cacheline_alignment) char cacheline_padding5[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> recycleIx;    // recycle pointer
+  alignas(cacheline_alignment) char cacheline_padding6[cacheline_size];
+};
+} // namespace async
diff --git a/src/3rdparty/async/threadpool.h b/src/3rdparty/async/threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..395a9d85041b5631d8a79d3a0721db0a8bb091c8
--- /dev/null
+++ b/src/3rdparty/async/threadpool.h
@@ -0,0 +1,192 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+#include "queue.h"
+#include <atomic>
+#include <functional>
+#include <future>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+namespace async {
+// thread pool to execute functions, functors, lambdas asynchronously,
+// default poolsize = machine's logical CPU cores/threads
+class threadpool final {
+public:
+  // hardware_concurrency() is 0 when unknown; fall back to a single worker
+  static int defaultpoolsize() { const int n = std::thread::hardware_concurrency(); return n > 0 ? n : 1; }
+  threadpool(int poolsize = defaultpoolsize())
+      : idlecount(0), conflag(false) {
+    configurepool(poolsize);
+  }
+
+  threadpool(const threadpool &) = delete;
+  threadpool(threadpool &&) = delete;
+  threadpool &operator=(const threadpool &) = delete;
+  threadpool &operator=(threadpool &&) = delete;
+
+  ~threadpool() { cleanup(); }
+
+  inline std::size_t size() {
+    std::lock_guard<std::mutex> lg(poolmux);
+    return threads.size();
+  }
+
+  inline int idlesize() { return idlecount; }
+
+  // Resize the pool to poolsize workers; may be called at any time after
+  // construction and before destruction. It is thread-safe, but calling it
+  // from the main or a manager thread is recommended. Shrinking waits until
+  // the removed workers have finished their current task.
+  void configurepool(std::size_t poolsize) {
+    std::unique_lock<std::mutex> veclk(poolmux);
+    auto currentsize = threads.size();
+    if (currentsize < poolsize) { // expand the pool
+      for (std::size_t i = currentsize; i < poolsize; i++) {
+        tpstops.emplace_back(addthread());
+      }
+    } else if (currentsize > poolsize) { // shrink the pool
+      std::vector<std::unique_ptr<std::thread>> dumpthreads;
+      std::vector<std::atomic<bool> *> dumpthreadstops;
+      std::move(threads.begin() + poolsize, threads.end(),
+                std::back_inserter(dumpthreads));
+      std::move(tpstops.begin() + poolsize, tpstops.end(),
+                std::back_inserter(dumpthreadstops));
+      tpstops.resize(poolsize);
+      threads.resize(poolsize);
+      veclk.unlock(); // signal and join without holding poolmux
+      for (auto &a : dumpthreadstops) *a = true;
+      {
+        std::unique_lock<std::mutex> lk(qcvmux); // suspended threads to quit
+        qcv.notify_all();
+      }
+      // Removed workers keep a reference to *this, so wait for them to leave
+      // instead of detaching; only a worker that removes itself is detached.
+      for (auto &t : dumpthreads)
+        if (t->get_id() == std::this_thread::get_id()) t->detach(); else t->join();
+    }
+  }
+
+  template <typename Func, typename... Args>
+  inline auto post(Func &&func, Args &&... args)
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER <= 1800)
+      -> std::future<typename std::result_of<Func(Args...)>::type>
+#endif
+  { // TODO: replace result_of with invoke_result_t when migrate to c++17
+    auto taskptr = std::make_shared<
+        std::packaged_task<typename std::result_of<Func(Args...)>::type()>>(
+        std::bind(std::forward<Func>(func), std::forward<Args>(args)...));
+    taskqueue.enqueue([taskptr]() { (*taskptr)(); });
+    {
+      std::lock_guard<std::mutex> lg(qcvmux);
+      conflag = true;
+    }
+    qcv.notify_one();
+    return taskptr->get_future();
+  }
+
+  template <typename Func>
+  inline auto post(Func &&func)
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) ||   \
+    (defined(_MSC_VER) && _MSC_VER <= 1800)
+      -> std::future<typename std::result_of<Func()>::type>
+#endif
+  { // a special case for func() type without any parameters, might be
+    // removed later
+    auto taskptr = std::make_shared<
+        std::packaged_task<typename std::result_of<Func()>::type()>>(
+        std::forward<Func>(func));
+    taskqueue.enqueue([taskptr]() { (*taskptr)(); });
+    {
+      std::lock_guard<std::mutex> lg(qcvmux);
+      conflag = true;
+    }
+    qcv.notify_one();
+    return taskptr->get_future();
+  }
+
+private:
+  struct executor {
+    executor(std::unique_ptr<std::atomic<bool>> &&ptr, threadpool &pool)
+        : stop(std::move(ptr)), thpool(pool) {}
+    void operator()() {
+      while (!*stop) {
+        if (!thpool.executetask_in_loop(*stop)) {
+          return; // signaled to quit
+        }
+        thpool.wait_for_task(*stop); // wait for new task
+      }
+    }
+
+  private:
+    std::unique_ptr<std::atomic<bool>> stop;
+    threadpool &thpool;
+  };
+
+  std::atomic<bool> *addthread() {
+    auto stopuniptr = std::make_unique<std::atomic<bool>>(false);
+    auto stoprawptr = stopuniptr.get();
+    threads.emplace_back(
+        std::make_unique<std::thread>(executor(std::move(stopuniptr), *this)));
+    return stoprawptr;
+  }
+
+  void cleanup() { // stop and join all workers; callers must have stopped posting tasks
+    {
+      std::lock_guard<std::mutex> lk(qcvmux);
+      qcv.notify_all(); // issued before any stop flag is set; the original
+                        // author marks this first notification as removable
+    }
+    for (auto &stop : tpstops) {
+      *stop = true; // ask every worker to leave its loop
+    }
+    {
+      std::lock_guard<std::mutex> lk(qcvmux);
+      qcv.notify_all(); // wake sleeping workers so they observe the stop flag
+    }
+    for (auto &thread : threads) {
+      if (thread->joinable())
+        thread->join(); // wait until the worker has actually exited
+    }
+    threads.clear();
+    tpstops.clear();
+  }
+
+  inline void wait_for_task(std::atomic<bool> const &stop) {
+    idlecount.fetch_add(1, std::memory_order_relaxed); // worker is idle while it waits
+    {
+      std::unique_lock<std::mutex> lk(qcvmux);
+      qcv.wait(lk, [&]() { // sleep until post() raised conflag or stop was set
+        return conflag || stop.load(std::memory_order_acquire);
+      }); // original note: the explicit memory order could be removed
+      conflag = false; // consume the wake-up request
+    }
+    idlecount.fetch_sub(1, std::memory_order_relaxed);
+  }
+
+  inline bool executetask_in_loop(std::atomic<bool> const &stop) { // false = stop requested
+    std::function<void()> func;
+    for (; taskqueue.dequeue(func);) { // presumably dequeue() yields false once the queue is empty
+      func();
+      if (stop) // stop is signaled
+        return false; // queued tasks that were not started yet are left behind
+    }
+    return true; // queue drained, the caller may go to sleep
+  }
+
+  std::vector<std::unique_ptr<std::thread>> threads;
+  std::vector<std::atomic<bool> *> tpstops; // threads terminate flags
+  async::queue<std::function<void()>> taskqueue;
+  std::atomic<int> idlecount; // idle thread count
+  std::mutex qcvmux, poolmux;
+  std::condition_variable qcv;
+  bool conflag; // continue flag for cv
+};
+} // namespace async
diff --git a/src/3rdparty/async/utility.h b/src/3rdparty/async/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5bb2d1f45e41e397645af522fd05889a6422c03
--- /dev/null
+++ b/src/3rdparty/async/utility.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////
+//          Copyright Yibo Zhu 2017
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+/////////////////////////////////////////////////////////////////////
+#pragma once
+
+#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus < 201103L) ||    \
+    (defined(_MSC_VER) && _MSC_VER < 1800)
+#error This library needs at least a C++11 compliant compiler
+#endif
+#include <climits>
+#include <cstdint>
+#include <memory>
+#include <string>
+template <typename T> static constexpr T getBitmask(unsigned int const bits) {
+  return bits == 0 ? static_cast<T>(0) // a full-width shift would be UB
+      : static_cast<T>(static_cast<T>(-1) >> ((sizeof(T) * CHAR_BIT) - bits));
+} // mask with the `bits` lowest bits set; requires bits <= bit width of T
+
+#if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+// c++14 impl
+// Population count: number of 1-bits in `n` (C++14 loop form).
+static constexpr unsigned int getSetBitsCount(std::uint64_t n) {
+  unsigned int ones = 0;
+  // each pass clears the lowest set bit, so the loop runs once per 1-bit
+  for (; n != 0; n &= n - 1)
+    ++ones;
+  return ones;
+}
+
+// Number of trailing zero bits of `n`, i.e. the shift that brings its lowest
+// set bit to position 0; yields 0 for n == 0. Needs C++14 relaxed constexpr.
+static constexpr unsigned int getShiftBitsCount(std::uint64_t n) {
+  unsigned int shift = 0;
+  if (n != 0) {
+    // drop zero bits from the bottom until the lowest set bit is reached
+    for (; (n & 1u) == 0; n >>= 1)
+      ++shift;
+  }
+  return shift;
+}
+
+#elif __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
+// c++11 impl
+static constexpr unsigned int getSetBitsCount(std::uint64_t n) {
+  return n == 0 ? 0 : 1 + getSetBitsCount(n & (n - 1)); // n & (n - 1) clears the lowest set bit
+}
+
+static constexpr unsigned int getShiftBitsCount(std::uint64_t n) {
+  return n == 0 ? 0 : ((n & 0x1) == 0 ? 1 + getShiftBitsCount(n >> 1) : 0); // counts trailing zero bits; 0 for n == 0
+}
+
+#if (__cplusplus == 201103L) && (defined(__clang__) || defined(__GNUC__))
+namespace std { // for c+11
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args &&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+} // namespace std
+#endif
+
+#else
+#error This library needs at least a C++11 compliant compiler
+#endif
diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2d67730ae88d93001a0652ef14d3448c39226a9
--- /dev/null
+++ b/src/3rdparty/cctbx/pystreambuf.h
@@ -0,0 +1,519 @@
+/* Original code: https://gist.github.com/asford/544323a5da7dddad2c9174490eb5ed06
+ * License:
+ * This component utilizes components derived from cctbx, available at
+ * http://cci.lbl.gov/cctbx_sources/boost_adaptbx/python_streambuf.h
+ *
+ * *** License agreement ***
+ *
+ * cctbx Copyright (c) 2006, The Regents of the University of
+ * California, through Lawrence Berkeley National Laboratory (subject to
+ * receipt of any required approvals from the U.S. Dept. of Energy).  All
+ * rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * (1) Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * (2) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * (3) Neither the name of the University of California, Lawrence Berkeley
+ * National Laboratory, U.S. Dept. of Energy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You are under no obligation whatsoever to provide any bug fixes,
+ * patches, or upgrades to the features, functionality or performance of
+ * the source code ("Enhancements") to anyone; however, if you choose to
+ * make your Enhancements available either publicly, or directly to
+ * Lawrence Berkeley National Laboratory, without imposing a separate
+ * written license agreement for such Enhancements, then you hereby grant
+ * the following license: a  non-exclusive, royalty-free perpetual license
+ * to install, use, modify, prepare derivative works, incorporate into
+ * other computer software, distribute, and sublicense such enhancements or
+ * derivative works thereof, in binary and source code form.
+*/
+
+#pragma once
+
+#include <streambuf>
+#include <iostream>
+
+#include <pybind11/pybind11.h>
+
+namespace pystreambuf {
+
+/// A stream buffer getting data from and putting data into a Python file object
+/** The aims are as follow:
+
+    - Given a C++ function acting on a standard stream, e.g.
+
+      \code
+      void read_inputs(std::istream& input) {
+        ...
+        input >> something >> something_else;
+      }
+      \endcode
+
+      and given a piece of Python code which creates a file-like object,
+      to be able to pass this file object to that C++ function, e.g.
+
+      \code
+      import gzip
+      gzip_file_obj = gzip.GzipFile(...)
+      read_inputs(gzip_file_obj)
+      \endcode
+
+      and have the standard stream pull data from and put data into the Python
+      file object.
+
+    - When Python \c read_inputs() returns, the Python object is able to
+      continue reading or writing where the C++ code left off.
+
+    - Operations in C++ on mere files should be competitively fast compared
+      to the direct use of \c std::fstream.
+
+
+    \b Motivation
+
+      - the standard Python library offer of file-like objects (files,
+        compressed files and archives, network, ...) is far superior to the
+        offer of streams in the C++ standard library and Boost C++ libraries.
+
+      - i/o code involves a fair amount of text processing which is more
+        efficiently prototyped in Python but then one may need to rewrite
+        a time-critical part in C++, in as seamless a manner as possible.
+
+    \b Usage
+
+    This is 2-step:
+
+      - a trivial wrapper function
+
+        \code
+          using boost_adaptbx::python::streambuf;
+          void read_inputs_wrapper(streambuf& input)
+          {
+            streambuf::istream is(input);
+            read_inputs(is);
+          }
+
+          def("read_inputs", read_inputs_wrapper);
+        \endcode
+
+        which has to be written every time one wants a Python binding for
+        such a C++ function.
+
+      - the Python side
+
+        \code
+          from boost.python import streambuf
+          read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
+        \endcode
+
+        \c buffer_size is optional. See also: \c default_buffer_size
+
+  Note: references are to the C++ standard (the numbers between parentheses
+  at the end of references are margin markers).
+*/
+class streambuf : public std::basic_streambuf<char>
+{
+  private:
+    typedef std::basic_streambuf<char> base_t;
+
+  public:
+    /* The syntax
+        using base_t::char_type;
+       would be nicer but Visual Studio C++ 8 chokes on it
+    */
+    typedef base_t::char_type   char_type;
+    typedef base_t::int_type    int_type;
+    typedef base_t::pos_type    pos_type;
+    typedef base_t::off_type    off_type;
+    typedef base_t::traits_type traits_type;
+
+    /// The default size of the read and write buffer.
+    /** They are respectively used to buffer data read from and data written to
+        the Python file object. It can be modified from Python.
+    */
+    static constexpr std::size_t default_buffer_size = 1024;
+
+    /// Construct from a Python file object
+    /** if buffer_size is 0 the current default_buffer_size is used. */
+    streambuf(
+      pybind11::object& python_file_obj,
+      std::size_t buffer_size_=0)
+    :
+      py_read (getattr(python_file_obj, "read", pybind11::none())),
+      py_write (getattr(python_file_obj, "write", pybind11::none())),
+      py_seek (getattr(python_file_obj, "seek", pybind11::none())),
+      py_tell (getattr(python_file_obj, "tell", pybind11::none())),
+      buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
+      write_buffer(0),
+      pos_of_read_buffer_end_in_py_file(0),
+      pos_of_write_buffer_end_in_py_file(buffer_size),
+      farthest_pptr(0)
+    {
+      assert(buffer_size != 0);
+      /* Some Python file objects (e.g. sys.stdout and sys.stdin) have
+         non-functional seek and tell. If so, assign None to py_tell and
+         py_seek. */
+      if (!py_tell.is_none()) {
+        try {
+          py_tell();
+        }
+        catch (pybind11::error_already_set& err) {
+          py_tell = pybind11::none();
+          py_seek = pybind11::none();
+          err.restore();
+          PyErr_Clear();
+        }
+      }
+
+      if (!py_write.is_none()) {
+        // add one extra byte for characters passed to the overflow() method
+        write_buffer = new char[buffer_size + 1];
+        setp(write_buffer, write_buffer + buffer_size);  // 27.5.2.4.5 (5)
+        farthest_pptr = pptr();
+      }
+      else {
+        // The first attempt at output will result in a call to overflow
+        setp(0, 0);
+      }
+
+      if (!py_tell.is_none()){
+        off_type py_pos = py_tell().cast<off_type>();
+        pos_of_read_buffer_end_in_py_file = py_pos;
+        // the write offset is the file position of epptr(), not of pbase():
+        // seekoff_without_calling_python() measures positions back from epptr()
+        pos_of_write_buffer_end_in_py_file = py_pos + (epptr() - pbase());
+      }
+    }
+
+    /// Mundane destructor freeing the allocated resources
+    virtual ~streambuf() { delete[] write_buffer; }
+    streambuf(const streambuf&) = delete;  // write_buffer is uniquely owned:
+    streambuf& operator=(const streambuf&) = delete;  // a copy would double-free
+
+    /// C.f. C++ standard section 27.5.2.4.3
+    /** It is essential to override this virtual function for the stream
+        member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
+     */
+    virtual std::streamsize showmanyc() {
+      int_type const failure = traits_type::eof();
+      int_type status = underflow();
+      if (status == failure) return -1;
+      return egptr() - gptr();
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.3
+    virtual int_type underflow() {
+      int_type const failure = traits_type::eof();
+      if (py_read.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'read' attribute");
+      }
+      read_buffer = py_read(buffer_size);
+      char *read_buffer_data;
+      pybind11::ssize_t py_n_read;
+      if (PYBIND11_BYTES_AS_STRING_AND_SIZE(read_buffer.ptr(),
+            &read_buffer_data, &py_n_read) == -1) {
+        setg(0, 0, 0);
+        throw std::invalid_argument(
+          "The method 'read' of the Python file object "
+          "did not return a string.");
+      }
+      off_type n_read = (off_type)py_n_read;
+      pos_of_read_buffer_end_in_py_file += n_read;
+      setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
+      // ^^^27.5.2.3.1 (4)
+      if (n_read == 0) return failure;
+      return traits_type::to_int_type(read_buffer_data[0]);
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.5
+    virtual int_type overflow(int_type c=traits_type::eof()) {
+      if (py_write.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'write' attribute");
+      }
+      farthest_pptr = std::max(farthest_pptr, pptr());
+      off_type n_written = (off_type)(farthest_pptr - pbase());
+      if (!traits_type::eq_int_type(c, traits_type::eof())) {
+        // add the overflown character to the end of the buffer
+        // (we have one extra byte just for that)
+        write_buffer[n_written++] = traits_type::to_char_type(c);
+      }
+      pybind11::bytes chunk(pbase(), n_written);
+      py_write(chunk);
+      if (n_written) {
+        pos_of_write_buffer_end_in_py_file += n_written;
+        setp(pbase(), epptr());
+        // ^^^ 27.5.2.4.5 (5)
+        farthest_pptr = pptr();
+      }
+      return traits_type::eq_int_type(
+        c, traits_type::eof()) ? traits_type::not_eof(c) : c;
+    }
+
+    /// Update the python file to reflect the state of this stream buffer
+    /** Empty the write buffer into the Python file object and set the seek
+        position of the latter accordingly (C++ standard section 27.5.2.4.2).
+        If there is no write buffer or it is empty, but there is a non-empty
+        read buffer, set the Python file object seek position to the
+        seek position in that read buffer.
+    */
+    virtual int sync() {
+      int result = 0;
+      farthest_pptr = std::max(farthest_pptr, pptr());
+      if (farthest_pptr && farthest_pptr > pbase()) {
+        off_type delta = pptr() - farthest_pptr;
+        int_type status = overflow();
+        if (traits_type::eq_int_type(status, traits_type::eof())) result = -1;
+        if (!py_seek.is_none()) py_seek(delta, 1);
+      }
+      else if (gptr() && gptr() < egptr()) {
+        if (!py_seek.is_none()) py_seek(gptr() - egptr(), 1);
+      }
+      return result;
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.2
+    /** This implementation is optimised to look whether the position is within
+        the buffers, so as to avoid calling Python seek or tell. It is
+        important for many applications that the overhead of calling into Python
+        is avoided as much as possible (e.g. parsers which may do a lot of
+        backtracking). Returns the new stream position, or -1 on failure.
+    */
+    virtual
+    pos_type seekoff(off_type off, std::ios_base::seekdir way,
+                     std::ios_base::openmode which=  std::ios_base::in
+                                                   | std::ios_base::out)
+    {
+      /* In practice, "which" is either std::ios_base::in or out (seekg or seekp
+         was called on the stream); that simplifies the code in a few places. */
+      int const failure = off_type(-1);
+
+      if (py_seek.is_none()) {
+        throw std::invalid_argument(
+          "That Python file object has no 'seek' attribute");
+      }
+
+      // we need the read buffer to contain something!
+      if (which == std::ios_base::in && !gptr()) {
+        if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
+          return failure;
+        }
+      }
+
+      // compute the whence parameter for Python seek
+      int whence;
+      switch (way) {
+        case std::ios_base::beg:
+          whence = 0;
+          break;
+        case std::ios_base::cur:
+          whence = 1;
+          break;
+        case std::ios_base::end:
+          whence = 2;
+          break;
+        default:
+          return failure;
+      }
+
+      // Let's have a go
+      off_type result;
+      if (!seekoff_without_calling_python(off, way, which, result)) {
+        // we need to call Python
+        if (which == std::ios_base::out) overflow();
+        if (way == std::ios_base::cur) {
+          if      (which == std::ios_base::in)  off -= egptr() - gptr();
+          else if (which == std::ios_base::out) off += pptr() - pbase();
+        }
+        py_seek(off, whence);
+        result = off_type(py_tell().cast<off_type>());
+        // the buffers now map to a new file offset: re-anchor the bookkeeping
+        if (which == std::ios_base::in) pos_of_read_buffer_end_in_py_file = result;
+        else pos_of_write_buffer_end_in_py_file = result + (epptr() - pbase());
+        if (which == std::ios_base::in) underflow();
+      }
+      return result;
+    }
+
+    /// C.f. C++ standard section 27.5.2.4.2
+    virtual
+    pos_type seekpos(pos_type sp,
+                     std::ios_base::openmode which=  std::ios_base::in
+                                                   | std::ios_base::out)
+    {
+      return streambuf::seekoff(sp, std::ios_base::beg, which);
+    }
+
+  private:
+    pybind11::object py_read, py_write, py_seek, py_tell;
+
+    std::size_t buffer_size;
+
+    /* This is actually a Python bytes object and the actual read buffer is
+       its internal data, i.e. an array of characters.
+     */
+    pybind11::bytes read_buffer;
+
+    /* A mere array of char's allocated on the heap at construction time and
+       de-allocated only at destruction time.
+    */
+    char *write_buffer;
+
+    off_type pos_of_read_buffer_end_in_py_file,
+             pos_of_write_buffer_end_in_py_file;
+
+    // the farthest place the buffer has been written into
+    char *farthest_pptr;
+
+    /// Try to satisfy a seek inside the current buffer; false means "ask Python"
+    bool seekoff_without_calling_python(
+      off_type off,
+      std::ios_base::seekdir way,
+      std::ios_base::openmode which,
+      off_type & result)
+    {
+      // Buffer range and current position
+      off_type buf_begin, buf_end, buf_cur, upper_bound;
+      off_type pos_of_buffer_end_in_py_file;
+      if (which == std::ios_base::in) {
+        pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
+        buf_begin = reinterpret_cast<std::streamsize>(eback());
+        buf_cur = reinterpret_cast<std::streamsize>(gptr());
+        buf_end = reinterpret_cast<std::streamsize>(egptr());
+        upper_bound = buf_end;
+      }
+      else if (which == std::ios_base::out) {
+        pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
+        buf_begin = reinterpret_cast<std::streamsize>(pbase());
+        buf_cur = reinterpret_cast<std::streamsize>(pptr());
+        buf_end = reinterpret_cast<std::streamsize>(epptr());
+        farthest_pptr = std::max(farthest_pptr, pptr());
+        upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
+      }
+      else {
+        throw std::runtime_error(  // was constructed but never thrown
+             "Control flow passes through branch that should be unreachable.");
+      }
+
+      // Sought position in "buffer coordinate"
+      off_type buf_sought;
+      if (way == std::ios_base::cur) {
+        buf_sought = buf_cur + off;
+      }
+      else if (way == std::ios_base::beg) {
+        buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
+      }
+      else if (way == std::ios_base::end) {
+        return false;
+      }
+      else {
+        throw std::runtime_error(  // was constructed but never thrown
+             "Control flow passes through branch that should be unreachable.");
+      }
+
+      // if the sought position is not in the buffer, give up
+      if (buf_sought < buf_begin || buf_sought >= upper_bound) return false;
+
+      // we are in wonderland
+      if      (which == std::ios_base::in)  gbump(buf_sought - buf_cur);
+      else if (which == std::ios_base::out) pbump(buf_sought - buf_cur);
+
+      result = pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
+      return true;
+    }
+
+  public:
+
+    class istream : public std::istream
+    {
+      public:
+        istream(streambuf& buf) : std::istream(&buf)
+        {
+          exceptions(std::ios_base::badbit | std::ios_base::failbit);
+        }
+
+        ~istream() { if (this->good()) this->sync(); }
+    };
+
+    class ostream : public std::ostream
+    {
+      public:
+        ostream(streambuf& buf) : std::ostream(&buf)
+        {
+          exceptions(std::ios_base::badbit | std::ios_base::failbit);
+        }
+
+        ~ostream() { if (this->good()) this->flush(); }
+    };
+};
+
+struct streambuf_capsule
+{
+  streambuf python_streambuf;
+
+  streambuf_capsule(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    python_streambuf(python_file_obj, buffer_size)
+  {}
+};
+
+struct ostream : private streambuf_capsule, streambuf::ostream
+{
+  ostream(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    streambuf_capsule(python_file_obj, buffer_size),
+    streambuf::ostream(python_streambuf)
+  {}
+
+  ~ostream()
+  {
+    if (this->good()){
+      this->flush();
+    }
+  }
+};
+
+struct istream : private streambuf_capsule, streambuf::istream
+{
+  istream(
+    pybind11::object& python_file_obj,
+    std::size_t buffer_size=0)
+  :
+    streambuf_capsule(python_file_obj, buffer_size),
+    streambuf::istream(python_streambuf)
+  {}
+
+  ~istream()
+  {
+    if (this->good()) {
+      this->sync();
+    }
+  }
+};
+
+} // namespace pystreambuf
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index cbd628b03e13f9c792e2b9dd90b5573ea3ea4568..2b2389e2c02986a7e4c13751b844c19c1ef2b17d 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -26,7 +26,7 @@
 #include <TNL/SystemInfo.h>
 #include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Config/ConfigDescription.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Benchmarks {
@@ -55,7 +55,7 @@ struct BenchmarkResult
       elements << time << stddev << stddev / time << bandwidth;
       if( speedup != 0 )
          elements << speedup;
-      else 
+      else
          elements << "N/A";
       return elements;
    }
@@ -356,9 +356,7 @@ inline Benchmark::MetadataMap getHardwareMetadata()
        { "system release", SystemInfo::getSystemRelease() },
        { "start time", SystemInfo::getCurrentTime() },
 #ifdef HAVE_MPI
-       { "number of MPI processes", convertToString( (Communicators::MpiCommunicator::IsInitialized())
-                                       ? Communicators::MpiCommunicator::GetSize( Communicators::MpiCommunicator::AllGroup )
-                                       : 1 ) },
+       { "number of MPI processes", convertToString( TNL::MPI::GetSize() ) },
 #endif
        { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) },
        { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) },
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 683e6960ad2f4c9e93adffa95f558b17edbf64aa..e8b5c9de15692c0de5a0ad30cc8b3762a05f76ef 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -19,9 +19,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -39,12 +38,6 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index,
 using namespace TNL;
 using namespace TNL::Benchmarks;
 
-#ifdef HAVE_MPI
-using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
-
 
 template< typename Matrix, typename Vector >
 void
@@ -115,7 +108,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
    // benchmark function
    auto compute = [&]() {
       matrix.vectorProduct( x, y );
-      Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() );
+      TNL::MPI::Barrier( matrix.getCommunicationGroup() );
    };
 
    benchmark.time< typename Matrix::DeviceType >( reset, performer, compute );
@@ -155,9 +148,9 @@ struct SpmvBenchmark
    using IndexType = typename MatrixType::IndexType;
    using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
 
-   using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >;
-   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using Partitioner = Containers::Partitioner< IndexType >;
+   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
    using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
 
    static bool
@@ -174,7 +167,7 @@ struct SpmvBenchmark
       matrix.getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (CommunicatorType::isDistributed()) ? "DistSpMV" : "SpMV" )
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       benchmark.setMetadataColumns( Benchmark::MetadataColumns({
@@ -194,13 +187,13 @@ struct SpmvBenchmark
          getTrivialOrdering( matrix, perm, iperm );
          MatrixType matrix_perm;
          Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm );
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix_perm, vector );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector );
       }
       else {
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix, vector );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix, vector );
@@ -230,13 +223,13 @@ struct SpmvBenchmark
                    VectorType& vector )
    {
       // set up the distributed matrix
-      const auto group = CommunicatorType::AllGroup;
+      const auto group = TNL::MPI::AllGroup();
       const auto localRange = Partitioner::splitRange( matrix.getRows(), group );
       DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group );
-      DistributedVector distributedVector( localRange, matrix.getRows(), group );
+      DistributedVector distributedVector( localRange, 0, matrix.getRows(), group );
 
       // copy the row lengths from the global matrix to the distributed matrix
-      DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group );
+      DistributedRowLengths distributedRowLengths( localRange, 0, matrix.getRows(), group );
       for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
          distributedRowLengths[ gi ] = matrix.getRowCapacity( gi );
@@ -272,8 +265,8 @@ struct SpmvBenchmark
       DistributedVector distributedY;
       distributedY.setLike( distributedVector );
       distributedMatrix.vectorProduct( distributedVector, distributedY );
-      const int rank = CommunicatorType::GetRank( distributedMatrix.getCommunicationGroup() );
-      const int nproc = CommunicatorType::GetSize( distributedMatrix.getCommunicationGroup() );
+      const int rank = TNL::MPI::GetRank( distributedMatrix.getCommunicationGroup() );
+      const int nproc = TNL::MPI::GetSize( distributedMatrix.getCommunicationGroup() );
       typename VectorType::ViewType subY( &y[ Partitioner::getOffset( matrix.getRows(), rank, nproc ) ],
                                           Partitioner::getSizeForRank( matrix.getRows(), rank, nproc ) );
       TNL_ASSERT_EQ( distributedY.getLocalView(), subY, "WRONG RESULT !!!" );
@@ -299,7 +292,7 @@ configSetup( Config::ConfigDescription & config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int
@@ -314,15 +307,15 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index a4c04578d5553f2f039a8e7fc575de0ad116c48d..c10c996e36c39448c632e33ba78f8153b5858c17 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -33,10 +33,10 @@ void barrier( const Matrix& matrix )
 {
 }
 
-template< typename Matrix, typename Communicator >
-void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix )
+template< typename Matrix >
+void barrier( const Matrices::DistributedMatrix< Matrix >& matrix )
 {
-   Communicator::Barrier( matrix.getCommunicationGroup() );
+   TNL::MPI::Barrier( matrix.getCommunicationGroup() );
 }
 
 template< typename Device >
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index e5a8d9819aa7e3c8fb31eecd62ba4932b6c1c731..3acfb2438c33539594cb3de6aa8f4cc429d21b06 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -24,9 +24,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
@@ -66,12 +65,6 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-#ifdef HAVE_MPI
-using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
-
 
 static const std::set< std::string > valid_solvers = {
    "gmres",
@@ -338,9 +331,9 @@ struct LinearSolversBenchmark
    using IndexType = typename MatrixType::IndexType;
    using VectorType = Containers::Vector< RealType, DeviceType, IndexType >;
 
-   using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >;
-   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using Partitioner = Containers::Partitioner< IndexType >;
+   using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
    using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector;
 
    static bool
@@ -388,7 +381,7 @@ struct LinearSolversBenchmark
       matrixPointer->getCompressedRowLengths( rowLengths );
       const IndexType maxRowLength = max( rowLengths );
 
-      const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" )
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       benchmark.setMetadataColumns( Benchmark::MetadataColumns({
@@ -413,13 +406,13 @@ struct LinearSolversBenchmark
          Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm );
          Matrices::reorderArray( x0, x0_perm, perm );
          Matrices::reorderArray( b, b_perm, perm );
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
          else
             runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm );
       }
       else {
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
          else
             runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b );
@@ -437,14 +430,14 @@ struct LinearSolversBenchmark
                    const VectorType& b )
    {
       // set up the distributed matrix
-      const auto group = CommunicatorType::AllGroup;
+      const auto group = TNL::MPI::AllGroup();
       const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group );
       SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group );
-      DistributedVector dist_x0( localRange, matrixPointer->getRows(), group );
-      DistributedVector dist_b( localRange, matrixPointer->getRows(), group );
+      DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group );
+      DistributedVector dist_b( localRange, 0, matrixPointer->getRows(), group );
 
       // copy the row capacities from the global matrix to the distributed matrix
-      DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group );
+      DistributedRowLengths distributedRowLengths( localRange, 0, matrixPointer->getRows(), group );
       for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
          const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
          distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi );
@@ -572,7 +565,7 @@ configSetup( Config::ConfigDescription& config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 
    config.addDelimiter( "Linear solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
@@ -597,14 +590,14 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp
index ab975ed078c470f4824d18e7848033e6fed73f2c..fcc8654bec5bbef012b2753843576a6889f2237c 100644
--- a/src/Benchmarks/ODESolvers/Euler.hpp
+++ b/src/Benchmarks/ODESolvers/Euler.hpp
@@ -10,8 +10,6 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include "ComputeBlockResidue.h"
 
 namespace TNL {
@@ -202,7 +200,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
    }
 
    localResidue /= tau * ( RealType ) size;
-   Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup );
+   TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() );
    //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl;
 }
 
diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp
index c97bfc236b8db8f321874c92f23fe11ca9771e08..b45faa1b41d18cbc45eb9307587bfdfeb0c80c74 100644
--- a/src/Benchmarks/ODESolvers/Merson.hpp
+++ b/src/Benchmarks/ODESolvers/Merson.hpp
@@ -13,8 +13,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 
 #include "Merson.h"
 
@@ -187,13 +185,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u )
          time += currentTau;
          computeNewTimeLevel( time, currentTau, u, newResidue );
          this->setResidue( newResidue );
- 
+
          /****
           * When time is close to stopTime the new residue
           * may be inaccurate significantly.
           */
          if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue );
-         
+
 
          if( ! this->nextIteration() )
             return false;
@@ -209,7 +207,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u )
          currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
          TNLMPI::Bcast( currentTau, 1, 0 );
-#endif        
+#endif
       }
       if( time + currentTau > this->getStopTime() )
          currentTau = this->getStopTime() - time; //we don't want to keep such tau
@@ -405,7 +403,7 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con
       }
 #endif
    }
-   Problem::CommunicatorType::Allreduce( &eps, &maxEps, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+   TNL::MPI::Allreduce( &eps, &maxEps, 1, MPI_MAX, TNL::MPI::AllGroup() );
    return maxEps;
 }
 
@@ -467,7 +465,7 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time,
    }
 
    localResidue /= tau * ( RealType ) size;
-   Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup);
+   TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() );
 /*#ifdef USE_MPI
    TNLMPI::Allreduce( localResidue, currentResidue, 1, MPI_SUM);
 #else
diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h
index ff81fd18e4576672a89f35f54ff37eeed4ba9d86..65f769dda7b41157671bda431c4b1454e0934167 100644
--- a/src/Benchmarks/ODESolvers/SimpleProblem.h
+++ b/src/Benchmarks/ODESolvers/SimpleProblem.h
@@ -17,7 +17,7 @@
 
 namespace TNL {
    namespace Benchmarks {
-      
+
 template< typename Real = double,
    typename Device = Devices::Host,
    typename Index = int >
@@ -27,8 +27,7 @@ struct SimpleProblem
    using DeviceType = Device;
    using IndexType = Index;
    using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType >;
-   using CommunicatorType = Communicators::NoDistrCommunicator;
-   
+
    template< typename VectorPointer >
    void getExplicitUpdate( const RealType& time,
       const RealType& tau,
@@ -45,10 +44,10 @@ struct SimpleProblem
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, u.getSize(), computeF, u, fu );
    }
-   
+
    template< typename Vector >
    void applyBoundaryConditions( const RealType& t, Vector& u ) {};
-      
+
 };
 
    } // namespace Benchmarks
diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h
index 60b5336639c9289bb39c8240d48d2b3121487210..a6ee67a624a01443eeabb12d540fc7d6cecb58d8 100644
--- a/src/Benchmarks/ODESolvers/benchmarks.h
+++ b/src/Benchmarks/ODESolvers/benchmarks.h
@@ -16,8 +16,6 @@
 #include <TNL/Config/ParameterContainer.h>
 
 #include "../Benchmarks.h"
-#include "SimpleProblem.h"
-
 
 #include <stdexcept>  // std::runtime_error
 
@@ -35,31 +33,6 @@ getPerformer()
    return "CPU";
 }
 
-/*template< typename Matrix >
-void barrier( const Matrix& matrix )
-{
-}
-
-template< typename Matrix, typename Communicator >
-void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix )
-{
-   Communicator::Barrier( matrix.getCommunicationGroup() );
-}*/
-
-template< typename Device >
-bool checkDevice( const Config::ParameterContainer& parameters )
-{
-   const String device = parameters.getParameter< String >( "device" );
-   if( device == "all" )
-      return true;
-   if( std::is_same< Device, Devices::Host >::value && device == "host" )
-      return true;
-   if( std::is_same< Device, Devices::Cuda >::value && device == "cuda" )
-      return true;
-   return false;
-}
-
-
 template< typename Solver, typename VectorPointer >
 void
 benchmarkSolver( Benchmark& benchmark,
@@ -90,7 +63,7 @@ benchmarkSolver( Benchmark& benchmark,
    auto compute = [&]() {
       solver.solve( u );
    };
-   
+
    // subclass BenchmarkResult to add extra columns to the benchmark
    // (iterations, preconditioned residue, true residue)
    /*struct MyBenchmarkResult : public BenchmarkResult
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index bbde8894518baaecf0b89d3ded7667db422de73d..0d8d3c04e6fdc2ba4c00ccbca254a737e432af53 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -23,9 +23,8 @@
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Solvers/ODE/Euler.h>
 #include <TNL/Solvers/ODE/Merson.h>
 
@@ -39,12 +38,6 @@ using namespace TNL;
 using namespace TNL::Benchmarks;
 using namespace TNL::Pointers;
 
-#ifdef HAVE_MPI
-using CommunicatorType = Communicators::MpiCommunicator;
-#else
-using CommunicatorType = Communicators::NoDistrCommunicator;
-#endif
-
 
 template< typename Real, typename Index >
 void
@@ -68,7 +61,7 @@ benchmarkODESolvers( Benchmark& benchmark,
 #ifdef HAVE_CUDA
       CudaVectorPointer cuda_u( dofs );
       *cuda_u = 0.0;
-#endif      
+#endif
       if( solver == "euler" || solver == "all" ) {
          using HostSolver = Solvers::ODE::Euler< HostProblem, SolverMonitorType >;
          benchmark.setOperation("Euler");
@@ -118,7 +111,7 @@ struct ODESolversBenchmark
         Benchmark::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
-      const String name = String( (CommunicatorType::isDistributed()) ? "Distributed ODE solvers" : "ODE solvers" );
+      const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" );
                           //+ " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) {
@@ -127,7 +120,7 @@ struct ODESolversBenchmark
             { "DOFs", convertToString( dofs ) },
          } ));
 
-         if( CommunicatorType::isDistributed() )
+         if( TNL::MPI::GetSize() > 1 )
             runDistributed( benchmark, metadata, parameters, dofs );
          else
             runNonDistributed( benchmark, metadata, parameters, dofs );
@@ -141,7 +134,7 @@ struct ODESolversBenchmark
                    const Config::ParameterContainer& parameters,
                    size_t dofs )
    {
-      //const auto group = CommunicatorType::AllGroup;
+      //const auto group = TNL::MPI::AllGroup();
 
       std::cout << "Iterative solvers:" << std::endl;
       benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs );
@@ -173,10 +166,10 @@ bool resolveRealTypes( Benchmark& benchmark,
    Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
-   if( ( realType == "float" || realType == "all" ) && 
+   if( ( realType == "float" || realType == "all" ) &&
        ! resolveIndexType< float >( benchmark, metadata, parameters ) )
       return false;
-   if( ( realType == "double" || realType == "all" ) && 
+   if( ( realType == "double" || realType == "all" ) &&
        ! resolveIndexType< double >( benchmark, metadata, parameters ) )
       return false;
    return true;
@@ -209,7 +202,7 @@ configSetup( Config::ConfigDescription& config )
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
    Devices::Cuda::configSetup( config );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 
    config.addDelimiter( "ODE solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
@@ -230,14 +223,14 @@ main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
-   const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
+   const int rank = TNL::MPI::GetRank();
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! CommunicatorType::setup( parameters ) )
+       ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String & logFileName = parameters.getParameter< String >( "log-file" );
diff --git a/src/Python/pytnl/CMakeLists.txt b/src/Python/pytnl/CMakeLists.txt
index 2065b0a13b8a9bd42da4deabafa9ba2b371fc756..15b8e6b0a4e56b1e663af2c27d7de9f8480fc275 100644
--- a/src/Python/pytnl/CMakeLists.txt
+++ b/src/Python/pytnl/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_subdirectory( tnl )
+if( BUILD_MPI )
+   add_subdirectory( tnl_mpi )
+endif()
 
 install( DIRECTORY . DESTINATION "include/pytnl"
          MESSAGE_NEVER
diff --git a/src/Python/pytnl/iostream_caster.h b/src/Python/pytnl/iostream_caster.h
new file mode 100644
index 0000000000000000000000000000000000000000..38f5d4e16c0c33f87028bebbd5b5e56548670c7d
--- /dev/null
+++ b/src/Python/pytnl/iostream_caster.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cctbx/pystreambuf.h>
+
+namespace pybind11 { namespace detail {
+    // Caster that lets bound functions take a std::istream backed by a Python
+    // object exposing a `read` attribute (wrapped through cctbx's pystreambuf).
+    template <> struct type_caster<std::istream> {
+        // Python -> C++: fails unless `src` has a non-None `read` attribute.
+        bool load(handle src, bool) {
+            if (getattr(src, "read", none()).is_none())
+                return false;
+            obj = reinterpret_borrow<object>(src);
+            value.reset(new pystreambuf::istream(obj, 0));
+            return true;
+        }
+
+    protected:
+        object obj;  // the Python object that `value` is built on
+        std::unique_ptr<pystreambuf::istream> value;
+
+    public:
+        static constexpr auto name = _("istream");
+        // C++ -> Python is not implemented: always yields None.
+        static handle cast(const std::istream *, return_value_policy, handle) {
+            return none().release();
+        }
+        operator std::istream*() { return value.get(); }
+        operator std::istream&() { return *value; }
+        template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
+    };
+
+    // Caster that lets bound functions take a std::ostream backed by a Python
+    // object exposing a `write` attribute (wrapped through cctbx's pystreambuf).
+    template <> struct type_caster<std::ostream> {
+        // Python -> C++: fails unless `src` has a non-None `write` attribute.
+        bool load(handle src, bool) {
+            if (getattr(src, "write", none()).is_none())
+                return false;
+            obj = reinterpret_borrow<object>(src);
+            value.reset(new pystreambuf::ostream(obj, 0));
+            return true;
+        }
+
+    protected:
+        object obj;  // the Python object that `value` is built on
+        std::unique_ptr<pystreambuf::ostream> value;
+
+    public:
+        static constexpr auto name = _("ostream");
+        // C++ -> Python is not implemented: always yields None.
+        static handle cast(const std::ostream *, return_value_policy, handle) {
+            return none().release();
+        }
+        operator std::ostream*() { return value.get(); }
+        operator std::ostream&() { return *value; }
+        template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
+    };
+}} // namespace pybind11::detail
diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt
index c7fcd80e24480e3fd8eb21cc8d807dc900ea1d66..dc1c3fcc34f5be21f9f9aba34b567edcf5c06ee8 100644
--- a/src/Python/pytnl/tnl/CMakeLists.txt
+++ b/src/Python/pytnl/tnl/CMakeLists.txt
@@ -6,15 +6,23 @@ set( sources
       Grid2D.cpp
       Grid3D.cpp
       Mesh.cpp
+      MeshReaders.cpp
+      MeshWriters.cpp
       Object.cpp
       SparseMatrix.cpp
       String.cpp
+      VTKTraits.cpp
       tnl.cpp
 )
 pybind11_add_module( pytnl ${sources} )
 
 # rename the shared library to tnl.cpython-XXm-x86_64-linux-gnu.so
-set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl )
+set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl DEBUG_POSTFIX "_dbg" )
+
+# indicate the postfix to the target so that the pybind11 module name can be set accordingly
+# (the generator expression follows the active configuration, just like DEBUG_POSTFIX does)
+target_compile_definitions( pytnl PRIVATE
+      $<$<CONFIG:Debug>:PYTNL_MODULE_POSTFIX=_dbg> )
 
 # Skip -march=native -mtune=native for pytnl - optimizing python bindings for
 # a specific architecture is not very useful and prevents using Python tools on
diff --git a/src/Python/pytnl/tnl/EntityTypes.h b/src/Python/pytnl/tnl/EntityTypes.h
deleted file mode 100644
index 1f10e2827dd2cc24d006c88f509e6b8d5a5cbf90..0000000000000000000000000000000000000000
--- a/src/Python/pytnl/tnl/EntityTypes.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-
-enum class EntityTypes { Cell, Face, Vertex };
-
-inline void
-export_EntityTypes( py::module & m )
-{
-    // avoid duplicate conversion -> export only once
-    static bool exported = false;
-    if( ! exported ) {
-        // TODO: should be nested types instead
-        py::enum_< EntityTypes >( m, "EntityTypes" )
-            .value("Cell", EntityTypes::Cell)
-            .value("Face", EntityTypes::Face)
-            .value("Vertex", EntityTypes::Vertex)
-        ;
-        exported = true;
-    }
-}
-
-template< typename Mesh >
-typename Mesh::GlobalIndexType
-mesh_getEntitiesCount( const Mesh & self, const EntityTypes & entity )
-{
-    if( entity == EntityTypes::Cell )
-        return self.template getEntitiesCount< typename Mesh::Cell >();
-    else if( entity == EntityTypes::Face )
-        return self.template getEntitiesCount< typename Mesh::Face >();
-    else if( entity == EntityTypes::Vertex )
-        return self.template getEntitiesCount< typename Mesh::Vertex >();
-    else
-        throw py::value_error("The entity parameter must be either Cell, Face or Vertex.");
-}
diff --git a/src/Python/pytnl/tnl/Grid.h b/src/Python/pytnl/tnl/Grid.h
index 8cf28a8f5bd393dfda5bfc01b6547c77ad66ba91..2622bd5c93dc07a5d67bfca2dde02ef72fdca6c7 100644
--- a/src/Python/pytnl/tnl/Grid.h
+++ b/src/Python/pytnl/tnl/Grid.h
@@ -5,7 +5,7 @@ namespace py = pybind11;
 
 #include "StaticVector.h"
 #include "Grid_getSpaceStepsProducts.h"
-#include "EntityTypes.h"
+#include "mesh_getters.h"
 
 #include <type_traits>
 
@@ -54,8 +54,6 @@ void export_Grid( py::module & m, const char* name )
 //    void (Grid::* _setDimensions1)(const IndexType) = &Grid::setDimensions;
     void (Grid::* _setDimensions2)(const typename Grid::CoordinatesType &) = &Grid::setDimensions;
 
-    export_EntityTypes(m);
-
     auto grid = py::class_<Grid, TNL::Object>( m, name )
         .def(py::init<>())
         .def_static("getMeshDimension", &Grid::getMeshDimension)
@@ -68,11 +66,13 @@ void export_Grid( py::module & m, const char* name )
         .def("setDomain", &Grid::setDomain)
         .def("getOrigin", &Grid::getOrigin, py::return_value_policy::reference_internal)
         .def("getProportions", &Grid::getProportions, py::return_value_policy::reference_internal)
-        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid >)
-        // TODO: if combined, the return type would depend on the runtime parameter (entity)
-        .def("getEntity_cell", &Grid::template getEntity<typename Grid::Cell>)
-        .def("getEntity_face", &Grid::template getEntity<typename Grid::Face>)
-        .def("getEntity_vertex", &Grid::template getEntity<typename Grid::Vertex>)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Cell >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Face >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Vertex >)
+        // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity)
+        .def("getCell", &Grid::template getEntity<typename Grid::Cell>)
+        .def("getFace", &Grid::template getEntity<typename Grid::Face>)
+        .def("getVertex", &Grid::template getEntity<typename Grid::Vertex>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Cell>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Face>)
         .def("getEntityIndex", &Grid::template getEntityIndex<typename Grid::Vertex>)
diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp
index aa0c8c0355363aeeeb0e7123cae82da3ea005dbf..48e3f939b8b656a7ab023857917377f38b8a974b 100644
--- a/src/Python/pytnl/tnl/Mesh.cpp
+++ b/src/Python/pytnl/tnl/Mesh.cpp
@@ -2,35 +2,12 @@
 #include "../tnl_conversions.h"
 
 #include "Mesh.h"
-#include <TNL/Meshes/Readers/VTKReader.h>
-#include <TNL/Meshes/Readers/VTUReader.h>
-
-template< typename Reader >
-void export_reader( py::module & m, const char* name )
-{
-    py::class_< Reader >( m, name )
-        .def(py::init<std::string>())
-        .def("loadMesh", &Reader::template loadMesh< MeshOfEdges >)
-        .def("loadMesh", &Reader::template loadMesh< MeshOfTriangles >)
-        .def("loadMesh", &Reader::template loadMesh< MeshOfTetrahedrons >)
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfEdges & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTriangles & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-//        .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTetrahedrons & mesh ) {
-//                return reader.loadMesh( name.c_str(), mesh );
-//            } )
-    ;
-}
 
 void export_Meshes( py::module & m )
 {
     export_Mesh< MeshOfEdges >( m, "MeshOfEdges" );
     export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" );
+    export_Mesh< MeshOfQuadrangles >( m, "MeshOfQuadrangles" );
     export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" );
-
-    export_reader< TNL::Meshes::Readers::VTKReader >( m, "VTKReader" );
-    export_reader< TNL::Meshes::Readers::VTUReader >( m, "VTUReader" );
+    export_Mesh< MeshOfHexahedrons >( m, "MeshOfHexahedrons" );
 }
diff --git a/src/Python/pytnl/tnl/Mesh.h b/src/Python/pytnl/tnl/Mesh.h
index 21fa015fc94d967146cc2e8f046ac0f70fcbdb39..3097f111f528fb06aef62f9d46af611a33509b6d 100644
--- a/src/Python/pytnl/tnl/Mesh.h
+++ b/src/Python/pytnl/tnl/Mesh.h
@@ -5,7 +5,7 @@ namespace py = pybind11;
 
 #include "../typedefs.h"
 #include "StaticVector.h"
-#include "EntityTypes.h"
+#include "mesh_getters.h"
 
 #include <TNL/String.h>
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
@@ -82,8 +82,11 @@ template< typename MeshEntity, typename Scope >
 void export_MeshEntity( Scope & scope, const char* name )
 {
     auto entity = py::class_< MeshEntity >( scope, name )
+//        .def(py::init<>())
+//        .def(py::init<typename MeshEntity::MeshType, typename MeshEntity::GlobalIndexType>())
         .def_static("getEntityDimension", &MeshEntity::getEntityDimension)
         .def("getIndex", &MeshEntity::getIndex)
+        .def("getTag", &MeshEntity::getTag)
         // TODO
     ;
 
@@ -95,23 +98,24 @@ void export_MeshEntity( Scope & scope, const char* name )
 template< typename Mesh >
 void export_Mesh( py::module & m, const char* name )
 {
-    // there are two templates - const and non-const - take only the const
-    auto (Mesh::* getEntity_cell)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Cell>;
-    auto (Mesh::* getEntity_face)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Face>;
-    auto (Mesh::* getEntity_vertex)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity<typename Mesh::Vertex>;
-
-    export_EntityTypes(m);
-
     auto mesh = py::class_< Mesh, TNL::Object >( m, name )
         .def(py::init<>())
         .def_static("getMeshDimension", &Mesh::getMeshDimension)
         .def_static("getSerializationType", &Mesh::getSerializationType)
         .def("getSerializationTypeVirtual", &Mesh::getSerializationTypeVirtual)
-        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh >)
-        // TODO: if combined, the return type would depend on the runtime parameter (entity)
-        .def("getEntity_cell", getEntity_cell)
-        .def("getEntity_face", getEntity_face)
-        .def("getEntity_vertex", getEntity_vertex)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Cell >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Face >)
+        .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Vertex >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Cell >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Face >)
+        .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Vertex >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Cell >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Face >)
+        .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Vertex >)
+        // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity)
+        .def("getCell", &Mesh::template getEntity<typename Mesh::Cell>)
+        .def("getFace", &Mesh::template getEntity<typename Mesh::Face>)
+        .def("getVertex", &Mesh::template getEntity<typename Mesh::Vertex>)
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Cell& cell ){ return getEntityCenter( mesh, cell ); } )
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Face& face ){ return getEntityCenter( mesh, face ); } )
         .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ return getEntityCenter( mesh, vertex ); } )
@@ -124,6 +128,12 @@ void export_Mesh( py::module & m, const char* name )
                                        return mesh.template isBoundaryEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } )
         .def("isBoundaryEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){
                                         return mesh.template isBoundaryEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Cell& cell ){
+                                       return mesh.template isGhostEntity< Mesh::Cell::getEntityDimension() >( cell.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Face& face ){
+                                       return mesh.template isGhostEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } )
+        .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){
+                                        return mesh.template isGhostEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } )
         // TODO: more?
     ;
 
diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4abae015de76ee4ff440871ed57752f13a7ad79
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshReaders.cpp
@@ -0,0 +1,39 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "MeshReaders.h"
+#include "../typedefs.h"
+
+void export_MeshReaders( py::module & m )
+{
+    using MeshReader = TNL::Meshes::Readers::MeshReader;
+    using XMLVTK = TNL::Meshes::Readers::XMLVTK;
+
+    // base class with trampolines for virtual methods
+    py::class_< MeshReader, PyMeshReader >( m, "MeshReader" )
+        .def(py::init<std::string>())
+        // bindings against the actual class, NOT the trampoline
+        .def("reset", &MeshReader::reset)
+        .def("detectMesh", &MeshReader::detectMesh)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfQuadrangles >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >)
+        .def("loadMesh", &MeshReader::template loadMesh< MeshOfHexahedrons >)
+    ;
+
+    py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" )
+        .def(py::init<std::string>())
+    ;
+
+    // base class for VTUReader and PVTUReader
+    py::class_< XMLVTK, PyXMLVTK, MeshReader >( m, "XMLVTK" )
+        .def(py::init<std::string>())
+        .def("readPointData", &XMLVTK::readPointData)
+        .def("readCellData", &XMLVTK::readCellData)
+   ;
+
+    py::class_< TNL::Meshes::Readers::VTUReader, XMLVTK >( m, "VTUReader" )
+        .def(py::init<std::string>())
+    ;
+}
diff --git a/src/Python/pytnl/tnl/MeshReaders.h b/src/Python/pytnl/tnl/MeshReaders.h
new file mode 100644
index 0000000000000000000000000000000000000000..22b40a6719319251168ad9dfe0922071bd2b9d91
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshReaders.h
@@ -0,0 +1,47 @@
+#include <TNL/Meshes/Readers/VTKReader.h>
+#include <TNL/Meshes/Readers/VTUReader.h>
+
+// trampoline classes needed for overriding virtual methods
+// https://pybind11.readthedocs.io/en/stable/advanced/classes.html
+
+class PyMeshReader
+: public TNL::Meshes::Readers::MeshReader
+{
+   using Parent = TNL::Meshes::Readers::MeshReader;
+
+public:
+   // inherit constructors
+   using TNL::Meshes::Readers::MeshReader::MeshReader;
+
+   // trampolines (one for each virtual method)
+   void reset() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, reset );
+   }
+
+   void detectMesh() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh );
+   }
+};
+
+class PyXMLVTK
+: public TNL::Meshes::Readers::XMLVTK
+{
+   using Parent = TNL::Meshes::Readers::XMLVTK;
+
+public:
+   // inherit constructors
+   using TNL::Meshes::Readers::XMLVTK::XMLVTK;
+
+   // trampolines (one for each virtual method)
+   void reset() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, reset );
+   }
+
+   void detectMesh() override
+   {
+      PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh );
+   }
+};
diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01f79ce2d6a8a3b1f4af2773ac27653148ec4fb3
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshWriters.cpp
@@ -0,0 +1,99 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "MeshWriters.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/MeshReader.h>
+
+#include <TNL/Meshes/Writers/VTKWriter.h>
+#include <TNL/Meshes/Writers/VTUWriter.h>
+
+template< typename Writer, TNL::Meshes::VTK::FileFormat default_format >
+void export_MeshWriter( py::module & m, const char* name )
+{
+    // We cannot use MeshReader::VariantVector for Python bindings, because its variants are
+    // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t,
+    // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map
+    // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given
+    // Python value fits into the C++ type when selecting the alternative for a scalar type, and
+    // for containers like std::vector it merely selects the first possible type. For reference, see
+    // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161
+    using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >;
+
+    // Binding to Writer directly is not possible, because the writer has a std::ostream attribute
+    // which would reference the streambuf created by the type caster from the Python file-like object.
+    // However, the streambuf would be destroyed as soon as the writer is constructed and control
+    // returned to Python, so the following invocations would use an invalid object and segfault.
+    // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own
+    // ostream attribute and is initialized by a py::object to avoid type casting.
+    using PythonWriter = PyWriter< Writer, default_format >;
+    py::class_< PythonWriter >( m, name )
+        .def(py::init<py::object, TNL::Meshes::VTK::FileFormat>(), py::keep_alive<1, 2>(),
+              py::arg("stream"), py::pos_only(), py::arg("format") = default_format)
+        .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1)
+        .def("writeVertices", &Writer::template writeEntities< 0 >)
+        .def("writeCells", &Writer::template writeEntities<>)
+        // the array is accepted as the restricted VariantVector defined above (not MeshReader::VariantVector)
+        .def("writePointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writePointData( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writeCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writeCellData( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writeDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       // we need a view for the std::vector
+                       using vector_t = std::decay_t<decltype(array)>;
+                       using view_t = TNL::Containers::ArrayView< std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >;
+                       view_t view( array.data(), array.size() );
+                       writer.writeDataArray( view, name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+    ;
+}
+
+void export_MeshWriters( py::module & m )
+{
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid1D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid1D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid2D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid2D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_Grid3D" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid3D" );
+
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfEdges" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >,        TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTriangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >,    TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfQuadrangles >,  TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfQuadrangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfQuadrangles >,  TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfQuadrangles" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfTetrahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfHexahedrons >,  TNL::Meshes::VTK::FileFormat::binary          >( m, "VTKWriter_MeshOfHexahedrons" );
+    export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfHexahedrons >,  TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfHexahedrons" );
+}
diff --git a/src/Python/pytnl/tnl/MeshWriters.h b/src/Python/pytnl/tnl/MeshWriters.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dd7185eadf48c11a4fa484e1e7327e32f92566f
--- /dev/null
+++ b/src/Python/pytnl/tnl/MeshWriters.h
@@ -0,0 +1,22 @@
+#include "../iostream_caster.h"
+#include <TNL/Meshes/VTKTraits.h>
+
+// helper struct is needed to ensure correct initialization order in the PyWriter constructor
+struct PyOstreamHelper
+{
+   py::object obj;
+   pystreambuf::ostream str;
+
+   PyOstreamHelper( py::object src )
+      : obj(py::reinterpret_borrow<py::object>(src)),
+        str(obj)
+   {}
+};
+
+template< typename Writer, TNL::Meshes::VTK::FileFormat default_format >
+struct PyWriter : public PyOstreamHelper, public Writer
+{
+   PyWriter( py::object src, TNL::Meshes::VTK::FileFormat format = default_format )
+   : PyOstreamHelper(src), Writer(str, format)  // pass the format requested from Python on to the writer
+   {}
+};
diff --git a/src/Python/pytnl/tnl/VTKTraits.cpp b/src/Python/pytnl/tnl/VTKTraits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..85d7964714971d4ef62eda867c5ca84bab1dfbde
--- /dev/null
+++ b/src/Python/pytnl/tnl/VTKTraits.cpp
@@ -0,0 +1,45 @@
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+#include <TNL/Meshes/VTKTraits.h>
+
+void export_VTKTraits( py::module & m )
+{  // registers the TNL::Meshes::VTK enumerations on module `m` under "VTK"-prefixed Python names
+    py::enum_< TNL::Meshes::VTK::FileFormat >( m, "VTKFileFormat")
+       .value("ascii", TNL::Meshes::VTK::FileFormat::ascii)
+       .value("binary", TNL::Meshes::VTK::FileFormat::binary)
+       .value("zlib_compressed", TNL::Meshes::VTK::FileFormat::zlib_compressed)
+    ;
+    py::enum_< TNL::Meshes::VTK::DataType >( m, "VTKDataType")
+       .value("CellData", TNL::Meshes::VTK::DataType::CellData)
+       .value("PointData", TNL::Meshes::VTK::DataType::PointData)
+    ;
+    py::enum_< TNL::Meshes::VTK::EntityShape >( m, "VTKEntityShape")
+       .value("Vertex", TNL::Meshes::VTK::EntityShape::Vertex)
+       .value("PolyVertex", TNL::Meshes::VTK::EntityShape::PolyVertex)
+       .value("Line", TNL::Meshes::VTK::EntityShape::Line)
+       .value("PolyLine", TNL::Meshes::VTK::EntityShape::PolyLine)
+       .value("Triangle", TNL::Meshes::VTK::EntityShape::Triangle)
+       .value("TriangleStrip", TNL::Meshes::VTK::EntityShape::TriangleStrip)
+       .value("Polygon", TNL::Meshes::VTK::EntityShape::Polygon)
+       .value("Pixel", TNL::Meshes::VTK::EntityShape::Pixel)
+       .value("Quad", TNL::Meshes::VTK::EntityShape::Quad)
+       .value("Tetra", TNL::Meshes::VTK::EntityShape::Tetra)
+       .value("Voxel", TNL::Meshes::VTK::EntityShape::Voxel)
+       .value("Hexahedron", TNL::Meshes::VTK::EntityShape::Hexahedron)
+       .value("Wedge", TNL::Meshes::VTK::EntityShape::Wedge)
+       .value("Pyramid", TNL::Meshes::VTK::EntityShape::Pyramid)
+    ;
+    py::enum_< TNL::Meshes::VTK::CellGhostTypes >( m, "VTKCellGhostTypes")
+       .value("DUPLICATECELL", TNL::Meshes::VTK::CellGhostTypes::DUPLICATECELL,               "the cell is present on multiple processors")
+       .value("HIGHCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::HIGHCONNECTIVITYCELL, "the cell has more neighbors than in a regular mesh")
+       .value("LOWCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::LOWCONNECTIVITYCELL,   "the cell has less neighbors than in a regular mesh")
+       .value("REFINEDCELL", TNL::Meshes::VTK::CellGhostTypes::REFINEDCELL,                   "other cells are present that refines it")
+       .value("EXTERIORCELL", TNL::Meshes::VTK::CellGhostTypes::EXTERIORCELL,                 "the cell is on the exterior of the data set")
+       .value("HIDDENCELL", TNL::Meshes::VTK::CellGhostTypes::HIDDENCELL,                     "the cell is needed to maintain connectivity, but the data values should be ignored")
+    ;
+    py::enum_< TNL::Meshes::VTK::PointGhostTypes >( m, "VTKPointGhostTypes")
+       .value("DUPLICATEPOINT", TNL::Meshes::VTK::PointGhostTypes::DUPLICATEPOINT, "the point is present on multiple processors")
+       .value("HIDDENPOINT", TNL::Meshes::VTK::PointGhostTypes::HIDDENPOINT,       "the point is needed to maintain connectivity, but the data values should be ignored")
+    ;
+}
diff --git a/src/Python/pytnl/tnl/mesh_getters.h b/src/Python/pytnl/tnl/mesh_getters.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5eddaa5ea3a3f62ad4d40ab3b01b96f5a798aa6
--- /dev/null
+++ b/src/Python/pytnl/tnl/mesh_getters.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <type_traits>
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getEntitiesCount( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getEntitiesCount< EntityType::getEntityDimension() >();
+}
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getGhostEntitiesCount( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getGhostEntitiesCount< EntityType::getEntityDimension() >();
+}
+
+template< typename Mesh, typename EntityType >
+typename Mesh::GlobalIndexType
+mesh_getGhostEntitiesOffset( const Mesh & self, const EntityType & entity )
+{
+    static_assert( std::is_same< EntityType, typename Mesh::Cell >::value ||
+                   std::is_same< EntityType, typename Mesh::Face >::value ||
+                   std::is_same< EntityType, typename Mesh::Vertex >::value,
+                   "incompatible entity type" );
+    return self.template getGhostEntitiesOffset< EntityType::getEntityDimension() >();
+}
diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp
index 0eb7c3e8b0aee48791a546e36291f4790c9ca8a9..65e9c14e47a153cc9d724c0df5ecff443bb65fb3 100644
--- a/src/Python/pytnl/tnl/tnl.cpp
+++ b/src/Python/pytnl/tnl/tnl.cpp
@@ -13,7 +13,10 @@ void export_String( py::module & m );
 void export_Grid1D( py::module & m );
 void export_Grid2D( py::module & m );
 void export_Grid3D( py::module & m );
+void export_VTKTraits( py::module & m );
 void export_Meshes( py::module & m );
+void export_MeshReaders( py::module & m );
+void export_MeshWriters( py::module & m );
 void export_SparseMatrices( py::module & m );
 
 template< typename T >
@@ -23,7 +26,7 @@ template< typename T >
 using _vector = TNL::Containers::Vector< T, TNL::Devices::Host, IndexType >;
 
 // Python module definition
-PYBIND11_MODULE(tnl, m)
+PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m)
 {
     register_exceptions(m);
 
@@ -41,7 +44,11 @@ PYBIND11_MODULE(tnl, m)
     export_Grid2D(m);
     export_Grid3D(m);
 
+    export_VTKTraits(m);
+
     export_Meshes(m);
+    export_MeshReaders(m);
+    export_MeshWriters(m);
 
     export_SparseMatrices(m);
 }
diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h
index 602d1cffd1ccd32660b72630c023f2d913f1da19..788a54813fce26d2021d5c8671e5721a49678585 100644
--- a/src/Python/pytnl/tnl_conversions.h
+++ b/src/Python/pytnl/tnl_conversions.h
@@ -1,3 +1,5 @@
 // conversion has to be registered for each object file
 #include "tnl_str_conversion.h"
 #include "tnl_tuple_conversion.h"
+#include "variant_caster.h"
+#include "iostream_caster.h"
diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2aa8f73dacd05197b0bfbc04e633d267167ed8d6
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt
@@ -0,0 +1,60 @@
+# enable C++14 for pytnl_mpi (due to py::overload_cast)
+set(PYBIND11_CPP_STANDARD -std=c++14)
+
+set( sources
+      DistributedMesh.cpp
+      DistributedMeshReaders.cpp
+      DistributedMeshWriters.cpp
+      tnl_mpi.cpp
+)
+pybind11_add_module( pytnl_mpi ${sources} )
+
+# rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so
+set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi DEBUG_POSTFIX "_dbg" )
+
+# indicate the postfix to the target so that the pybind11 module name can be set accordingly
+if( CMAKE_BUILD_TYPE STREQUAL "Debug")
+   target_compile_options( pytnl_mpi PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg )
+endif()
+
+# Skip -march=native -mtune=native for pytnl_mpi - optimizing python bindings for
+# a specific architecture is not very useful and prevents using Python tools on
+# hybrid clusters.
+get_target_property( pytnl_mpi_COMPILE_OPTIONS pytnl_mpi COMPILE_OPTIONS )
+if( pytnl_mpi_COMPILE_OPTIONS )
+   string( REPLACE "-march=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+   string( REPLACE "-mtune=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+   set_target_properties( pytnl_mpi PROPERTIES COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" )
+endif()
+
+# We have bindings for unsafe objects (e.g. Array::operator[]) where assertion
+# is the only safeguard, so we need to translate the TNL::AssertionError to
+# Python's AssertionError.
+# NDEBUG is defined in the global CMAKE_CXX_FLAGS and cannot be easily removed
+# per-target, so we need to undefine it by passing -U NDEBUG.
+target_compile_options( pytnl_mpi PRIVATE -U NDEBUG -D TNL_THROW_ASSERTION_ERROR )
+
+# disable errors due to -Wunused-value coming from pybind11
+if( ${WITH_CI_FLAGS} )
+   if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      target_compile_options( pytnl_mpi PRIVATE -Wno-error=unused-value )
+   endif()
+endif()
+
+
+# enable zlib and tinyxml2 (used by PVTUReader)
+find_package( ZLIB )
+if( ZLIB_FOUND )
+   target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_ZLIB")
+   target_include_directories(pytnl_mpi PUBLIC ${ZLIB_INCLUDE_DIRS})
+   target_link_libraries(pytnl_mpi PUBLIC ${ZLIB_LIBRARIES})
+endif()
+
+find_package( tinyxml2 QUIET )
+if( tinyxml2_FOUND )
+   target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_TINYXML2")
+   target_link_libraries(pytnl_mpi PUBLIC tinyxml2::tinyxml2)
+endif()
+
+
+install( TARGETS pytnl_mpi DESTINATION ${PYTHON_SITE_PACKAGES_DIR} )
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0af175f3cd24792a4581c33b7349a1b3f34ab07f
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp
@@ -0,0 +1,22 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../typedefs.h"
+#include "DistributedMesh.h"
+#include "../tnl/Array.h"
+
+void export_DistributedMeshes( py::module & m )
+{
+    // make sure that bindings for the local meshes are available
+    py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl)));
+
+    export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" );
+    export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" );
+    export_DistributedMesh< DistributedMeshOfQuadrangles >( m, "DistributedMeshOfQuadrangles" );
+    export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" );
+    export_DistributedMesh< DistributedMeshOfHexahedrons >( m, "DistributedMeshOfHexahedrons" );
+
+    // export VTKTypesArrayType
+    using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType;
+    export_Array< VTKTypesArrayType >(m, "VTKTypesArrayType");
+}
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.h b/src/Python/pytnl/tnl_mpi/DistributedMesh.h
new file mode 100644
index 0000000000000000000000000000000000000000..64afe5978dc4d82d20caa1d484640ecedeba030f
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+template< typename Mesh >
+void export_DistributedMesh( py::module & m, const char* name )
+{
+    auto mesh = py::class_< Mesh >( m, name )
+        .def(py::init<>())
+        .def_static("getMeshDimension", &Mesh::getMeshDimension)
+//        .def("setCommunicationGroup", &Mesh::setCommunicationGroup)
+//        .def("getCommunicationGroup", &Mesh::getCommunicationGroup)
+        .def("getLocalMesh", py::overload_cast<>(&Mesh::getLocalMesh), py::return_value_policy::reference_internal)
+        .def("setGhostLevels", &Mesh::setGhostLevels)
+        .def("getGhostLevels", &Mesh::getGhostLevels)
+        .def("getGlobalPointIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& {
+                return mesh.template getGlobalIndices< 0 >();
+            },
+            py::return_value_policy::reference_internal)
+        .def("getGlobalCellIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& {
+                return mesh.template getGlobalIndices< Mesh::getMeshDimension() >();
+            },
+            py::return_value_policy::reference_internal)
+        .def("vtkPointGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& {
+                return mesh.vtkPointGhostTypes();
+            },
+            py::return_value_policy::reference_internal)
+        .def("vtkCellGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& {
+                return mesh.vtkCellGhostTypes();
+            },
+            py::return_value_policy::reference_internal)
+    ;
+}
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c196a67cc4576b2865b9257a97c3986f028e983d
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp
@@ -0,0 +1,26 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../tnl/MeshReaders.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/PVTUReader.h>
+
+void export_DistributedMeshReaders( py::module & m )
+{
+    using XMLVTK = TNL::Meshes::Readers::XMLVTK;
+    using PVTUReader = TNL::Meshes::Readers::PVTUReader;
+
+    // make sure that bindings for the parent class are available
+    py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl)));
+
+    py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" )
+        .def(py::init<std::string>())
+        // loadMesh is not virtual in PVTUReader
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfQuadrangles >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >)
+        .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfHexahedrons >)
+    ;
+}
diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17bf57c128dabcb28793fa0bf831150929624abd
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp
@@ -0,0 +1,95 @@
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+
+#include "../tnl/MeshWriters.h"
+#include "../typedefs.h"
+
+#include <TNL/Meshes/Readers/MeshReader.h>
+
+#include <TNL/Meshes/Writers/PVTUWriter.h>
+
+template< template<typename> class WriterTemplate, typename LocalMesh, TNL::Meshes::VTK::FileFormat default_format >
+void export_DistributedMeshWriter( py::module & m, const char* name )
+{   // registers PyWriter< WriterTemplate< LocalMesh >, default_format > as the Python class `name` in module m
+    using Writer = WriterTemplate< LocalMesh >;
+    using Mesh = TNL::Meshes::DistributedMeshes::DistributedMesh< LocalMesh >;
+
+    // We cannot use MeshReader::VariantVector for Python bindings, because its variants are
+    // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t,
+    // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map
+    // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given
+    // Python value fits into the C++ type when selecting the alternative for a scalar type, and
+    // for containers like std::vector it merely selects the first possible type. For reference, see
+    // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161
+    using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >;
+
+    // Binding to Writer directly is not possible, because the writer has a std::ostream attribute
+    // which would reference the streambuf created by the type caster from the Python file-like object.
+    // However, the streambuf would be destroyed as soon as the writer is constructed and control
+    // returned to Python, so the following invocations would use an invalid object and segfault.
+    // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own
+    // ostream attribute and is initialized by a py::object to avoid type casting.
+    using PythonWriter = PyWriter< Writer, default_format >;
+    py::class_< PythonWriter >( m, name )
+        .def(py::init<py::object, TNL::Meshes::VTK::FileFormat>(), py::keep_alive<1, 2>(),
+              py::arg("stream"), py::pos_only(), py::arg("format") = default_format)
+        .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1)
+        .def("writeVertices", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities< 0 >),
+              py::arg("distributedMesh"))
+        .def("writeVertices", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities< 0 >),
+              py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0)
+        .def("writeCells", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities<>),
+              py::arg("distributedMesh"))
+        .def("writeCells", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities<>),
+              py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0)
+        // INCONSISTENCY: the C++ methods writePPointData, writePCellData, writePDataArray do not
+        // take the whole array as parameter, only the ValueType as a template parameter. Since
+        // this does not map nicely to Python, we pass the whole array just like in the
+        // VTKWriter and VTUWriter classes.
+        // The array is converted via the VariantVector alias defined above; only its value type is used.
+        .def("writePPointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePPointData< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writePCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePCellData< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        .def("writePDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) {
+               using mpark::visit;
+               visit( [&](auto&& array) {
+                       using value_type = typename std::decay_t<decltype(array)>::value_type;
+                       writer.template writePDataArray< value_type >( name, numberOfComponents );
+                   },
+                   array
+               );
+            },
+            py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1)
+        // NOTE: only the overload intended for sequential writing is exported, because we don't
+        // have type casters for MPI_Comm (ideally, it would be compatible with the mpi4py objects)
+        .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ),
+              py::arg("mainFileName"), py::arg("subdomainIndex"))
+    ;
+}
+
+void export_DistributedMeshWriters( py::module & m )
+{   // registers one PVTUWriter_<MeshType> Python class per supported local mesh type; all of them default to zlib-compressed output
+    constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed;
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges,        default_format >( m, "PVTUWriter_MeshOfEdges" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles,    default_format >( m, "PVTUWriter_MeshOfTriangles" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfQuadrangles,    default_format >( m, "PVTUWriter_MeshOfQuadrangles" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" );
+    export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfHexahedrons, default_format >( m, "PVTUWriter_MeshOfHexahedrons" );
+}
diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a422795b6e8bd47a799379329c9252aefb831920
--- /dev/null
+++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp
@@ -0,0 +1,49 @@
+#include "../exceptions.h"
+#include "../typedefs.h"
+
+// conversions have to be registered for each object file
+#include "../tnl_conversions.h"
+#include "TNL/MPI/Wrappers.h"
+
+// external functions
+void export_DistributedMeshes( py::module & m );
+void export_DistributedMeshReaders( py::module & m );
+void export_DistributedMeshWriters( py::module & m );
+
+#include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
+
+// Python module definition: initializes MPI when necessary and exports the bindings for distributed meshes
+PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m)
+{
+    register_exceptions(m);
+
+    // MPI initialization (skipped when MPI has already been initialized) and finalization
+    // https://stackoverflow.com/q/64647846
+    if( ! TNL::MPI::Initialized() ) {
+        int argc = 0;            // no command-line arguments are passed to MPI
+        char** argv = nullptr;
+        TNL::MPI::Init( argc, argv );
+    }
+    // module destructor that finalizes MPI: https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors
+    auto cleanup_callback = []() {
+        if( TNL::MPI::Initialized() && ! TNL::MPI::Finalized() )
+            TNL::MPI::Finalize();
+    };
+    m.add_object("_cleanup", py::capsule(cleanup_callback));  // the callback is the capsule's destructor
+
+    // bindings for distributed data structures
+    export_DistributedMeshes(m);
+    export_DistributedMeshReaders(m);
+    export_DistributedMeshWriters(m);
+
+    // bindings for functions (faces are the subentities of dimension 1 in 2D meshes and of dimension 2 in 3D meshes)
+    using TNL::Meshes::DistributedMeshes::distributeSubentities;
+    m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) {
+          distributeSubentities< 1 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfQuadrangles& mesh ) {
+          distributeSubentities< 1 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) {
+          distributeSubentities< 2 >( mesh ); });
+    m.def("distributeFaces", []( DistributedMeshOfHexahedrons& mesh ) {
+          distributeSubentities< 2 >( mesh ); });
+}
diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h
index 7a74237f02b6bb150a02b28e1b42bc9e343ea47c..7bc9fe0256f88c212995b987100000bbe808ce47 100644
--- a/src/Python/pytnl/typedefs.h
+++ b/src/Python/pytnl/typedefs.h
@@ -1,11 +1,28 @@
 #pragma once
 
+// helper macros (two levels are needed: the outer macro expands macros in its arguments before the _NX variant applies # or ##)
+#define PYTNL_STRINGIFY(U) PYTNL_STRINGIFY_NX(U)
+#define PYTNL_STRINGIFY_NX(U) #U
+
+#define PYTNL_PPCAT(A, B) PYTNL_PPCAT_NX(A, B)
+#define PYTNL_PPCAT_NX(A, B) A ## B
+
+// the Python module name depends on the build type: PYTNL_MODULE_NAME(name) appends PYTNL_MODULE_POSTFIX to name if it is defined, otherwise it yields name unchanged
+#ifdef PYTNL_MODULE_POSTFIX
+   #define PYTNL_MODULE_NAME(name) PYTNL_PPCAT(name, PYTNL_MODULE_POSTFIX)
+#else
+   #define PYTNL_MODULE_NAME(name) name
+#endif
+
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Meshes/Mesh.h>
+#include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DefaultConfig.h>
 #include <TNL/Meshes/Topologies/Edge.h>
 #include <TNL/Meshes/Topologies/Triangle.h>
+#include <TNL/Meshes/Topologies/Quadrangle.h>
 #include <TNL/Meshes/Topologies/Tetrahedron.h>
+#include <TNL/Meshes/Topologies/Hexahedron.h>
 
 using RealType = double;
 using DeviceType = TNL::Devices::Host;
@@ -16,24 +33,22 @@ using Grid2D = TNL::Meshes::Grid<2, RealType, DeviceType, IndexType>;
 using Grid3D = TNL::Meshes::Grid<3, RealType, DeviceType, IndexType>;
 
 using LocalIndexType = short int;
-using EdgeTopology = TNL::Meshes::Topologies::Edge;
-using TriangleTopology = TNL::Meshes::Topologies::Triangle;
-using TetrahedronTopology = TNL::Meshes::Topologies::Tetrahedron;
-using MeshOfEdges = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            EdgeTopology,
-                            EdgeTopology::dimension,
-                            RealType,
-                            IndexType,
-                            LocalIndexType > >;
-using MeshOfTriangles = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            TriangleTopology,
-                            TriangleTopology::dimension,
-                            RealType,
-                            IndexType,
-                            LocalIndexType > >;
-using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
-                            TetrahedronTopology,
-                            TetrahedronTopology::dimension,
+template< typename Topology >   // Mesh with the DefaultConfig for the given topology and the Real/Index/LocalIndex types chosen above
+using DefaultMeshTemplate = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig<
+                            Topology,
+                            Topology::dimension,
                             RealType,
                             IndexType,
                             LocalIndexType > >;
+// local mesh types, one per supported topology
+using MeshOfEdges = DefaultMeshTemplate< TNL::Meshes::Topologies::Edge >;
+using MeshOfTriangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Triangle >;
+using MeshOfQuadrangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Quadrangle >;
+using MeshOfTetrahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Tetrahedron >;
+using MeshOfHexahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Hexahedron >;
+// DistributedMesh instantiations for each of the local mesh types above
+using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >;
+using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >;
+using DistributedMeshOfQuadrangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfQuadrangles >;
+using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >;
+using DistributedMeshOfHexahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfHexahedrons >;
diff --git a/src/Python/pytnl/variant_caster.h b/src/Python/pytnl/variant_caster.h
new file mode 100644
index 0000000000000000000000000000000000000000..c032448b598754e148e632004e16a9826ef247e3
--- /dev/null
+++ b/src/Python/pytnl/variant_caster.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <mpark/variant.hpp>   // backport of std::variant from C++17
+
+namespace pybind11 { namespace detail {
+
+// Specialize type_caster for every instantiation of mpark::variant by deriving from variant_caster
+// (variant_caster is implemented in pybind11, where it is also used for casting C++17's std::variant)
+template<class... Args> struct type_caster<mpark::variant<Args...>>
+    : variant_caster<mpark::variant<Args...>> {};
+
+}} // namespace pybind11::detail
diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h
index 742acd5ed923b4d0e0cbf14e37be8fb40866ec06..aa7c008a7b6b5ccfe1445daebdc4312976eead0b 100644
--- a/src/TNL/Algorithms/DistributedScan.h
+++ b/src/TNL/Algorithms/DistributedScan.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Algorithms/Scan.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Algorithms {
@@ -32,10 +33,9 @@ struct DistributedScan
    {
       using RealType = typename DistributedVector::RealType;
       using DeviceType = typename DistributedVector::DeviceType;
-      using CommunicatorType = typename DistributedVector::CommunicatorType;
 
       const auto group = v.getCommunicationGroup();
-      if( group != CommunicatorType::NullGroup ) {
+      if( group != MPI::NullGroup() ) {
          // adjust begin and end for the local range
          const auto localRange = v.getLocalRange();
          begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin();
@@ -47,18 +47,18 @@ struct DistributedScan
          const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
 
          // exchange local sums between ranks
-         const int nproc = CommunicatorType::GetSize( group );
+         const int nproc = MPI::GetSize( group );
          RealType dataForScatter[ nproc ];
          for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
          Containers::Vector< RealType, Devices::Host > rankSums( nproc );
          // NOTE: exchanging general data types does not work with MPI
-         CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
+         MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
 
          // compute the scan of the per-rank sums
          Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
 
          // perform second phase: shift by the per-block and per-rank offsets
-         const int rank = CommunicatorType::GetRank( group );
+         const int rank = MPI::GetRank( group );
          Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
       }
    }
diff --git a/src/TNL/Communicators/MPITypeResolver.h b/src/TNL/Communicators/MPITypeResolver.h
deleted file mode 100644
index 5429d5e33c970576fac1856f3624eeef7a06a458..0000000000000000000000000000000000000000
--- a/src/TNL/Communicators/MPITypeResolver.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
-                          MPITypeResolver.h  -  description
-                             -------------------
-    begin                : Feb 4, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-namespace TNL {
-namespace Communicators {
-
-#ifdef HAVE_MPI
-template<typename Type>
-struct MPITypeResolver
-{
-   static inline MPI_Datatype getType()
-   {
-      static_assert( sizeof(Type) == sizeof(char) ||
-                     sizeof(Type) == sizeof(int) ||
-                     sizeof(Type) == sizeof(short int) ||
-                     sizeof(Type) == sizeof(long int),
-                     "Fatal Error - Unknown MPI Type");
-      switch( sizeof( Type ) )
-      {
-         case sizeof( char ):
-            return MPI_CHAR;
-         case sizeof( int ):
-            return MPI_INT;
-         case sizeof( short int ):
-            return MPI_SHORT;
-         case sizeof( long int ):
-            return MPI_LONG;
-      }
-      // this will never happen thanks to the static_assert above, but icpc is not that smart
-      // and complains about missing return statement at the end of non-void function
-      throw 0;
-   }
-};
-
-template<> struct MPITypeResolver< char >
-{
-    static inline MPI_Datatype getType(){return MPI_CHAR;};
-};
-
-template<> struct MPITypeResolver< int >
-{
-    static inline MPI_Datatype getType(){return MPI_INT;};
-};
-
-template<> struct MPITypeResolver< short int >
-{
-    static inline MPI_Datatype getType(){return MPI_SHORT;};
-};
-
-template<> struct MPITypeResolver< long int >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG;};
-};
-
-template<> struct MPITypeResolver< unsigned char >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;};
-};
-
-template<> struct MPITypeResolver< unsigned short int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;};
-};
-
-template<> struct MPITypeResolver< unsigned int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED;};
-};
-
-template<> struct MPITypeResolver< unsigned long int >
-{
-    static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;};
-};
-
-template<> struct MPITypeResolver< float >
-{
-    static inline MPI_Datatype getType(){return MPI_FLOAT;};
-};
-
-template<> struct MPITypeResolver< double >
-{
-    static inline MPI_Datatype getType(){return MPI_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< long double >
-{
-    static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;};
-};
-
-template<> struct MPITypeResolver< bool >
-{
-   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
-   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
-   static inline MPI_Datatype getType() { return MPI_C_BOOL; };
-};
-#endif
-
-} // namespace Communicators
-} // namespace TNL
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 1382fb7a6fef4877d9beb21d0a5223245ac74d16..cd51629687444ce64b2bcb3fd61c3491e3ecce9a 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -10,38 +10,10 @@
 
 #pragma once
 
-#include <iostream>
-#include <fstream>
-#include <cstring>
-
-#ifdef HAVE_MPI
-#include <mpi.h>
-#ifdef OMPI_MAJOR_VERSION
-   // header specific to OpenMPI (needed for CUDA-aware detection)
-   #include <mpi-ext.h>
-#endif
-
-#include <unistd.h>  // getpid
-
-#ifdef HAVE_CUDA
-    #include <TNL/Cuda/CheckDevice.h>
-
-    typedef struct __attribute__((__packed__))  {
-       char name[MPI_MAX_PROCESSOR_NAME];
-    } procName;
-#endif
-
-#endif
-
-#include <TNL/String.h>
-#include <TNL/Logger.h>
-#include <TNL/Debugging/OutputRedirection.h>
-#include <TNL/Communicators/MpiDefs.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Exceptions/MPISupportMissing.h>
-#include <TNL/Exceptions/MPIDimsCreateError.h>
-#include <TNL/Communicators/MPITypeResolver.h>
-
+#include <TNL/MPI/Wrappers.h>
+#include <TNL/MPI/DummyDefs.h>
+#include <TNL/MPI/Utils.h>
+#include <TNL/MPI/Config.h>
 
 namespace TNL {
 //! \brief Namespace for TNL communicators.
@@ -49,7 +21,8 @@ namespace Communicators {
 namespace {
 
 //! \brief MPI communicator.
-class MpiCommunicator
+class [[deprecated("use the functions in the TNL::MPI namespace instead")]]
+MpiCommunicator
 {
    public:
 #ifdef HAVE_MPI
@@ -71,275 +44,81 @@ class MpiCommunicator
 
       static void configSetup( Config::ConfigDescription& config, const String& prefix = "" )
       {
-#ifdef HAVE_MPI
-         config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true );
-         config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false );
-         config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 );
-#endif
+         MPI::configSetup( config, prefix );
       }
 
       static bool setup( const Config::ParameterContainer& parameters,
                          const String& prefix = "" )
       {
-#ifdef HAVE_MPI
-         if(IsInitialized())//i.e. - isUsed
-         {
-            const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
-            if( redirect )
-               setupRedirection();
-#ifdef HAVE_CUDA
-            int size;
-            MPI_Comm_size( MPI_COMM_WORLD, &size );
-            if( size > 1 )
-            {
-   #if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT
-               std::cout << "CUDA-aware MPI detected on this system ... " << std::endl;
-   #elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT
-               std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl;
-               return false;
-   #else
-               std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl;
-   #endif
-            }
-#endif // HAVE_CUDA
-            bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" );
-            int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" );
-
-            if( gdbDebug )
-            {
-               int rank = GetRank( MPI_COMM_WORLD );
-               int pid = getpid();
-
-               volatile int tnlMPIDebugAttached = 0;
-               MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD );
-               MPI_Barrier( MPI_COMM_WORLD );
-               if( rank == 0 )
-               {
-                  std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl;
-                  for( int i = 0; i < GetSize( MPI_COMM_WORLD ); i++ )
-                  {
-                     MPI_Status status;
-                     int recvPid;
-                     MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status );
-
-                     if( i == processToAttach || processToAttach == -1 )
-                     {
-                        std::cout << "  For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\""
-                                  << " -ex \"set variable tnlMPIDebugAttached=1\""
-                                  << " -ex \"continue\"" << std::endl;
-                     }
-                  }
-                  std::cout << std::flush;
-               }
-               if( rank == processToAttach || processToAttach == -1 )
-                  while( ! tnlMPIDebugAttached );
-               MPI_Barrier( MPI_COMM_WORLD );
-            }
-         }
-#endif // HAVE_MPI
-         return true;
+         return MPI::setup( parameters, prefix );
       }
 
-      static void Init(int& argc, char**& argv )
+      static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
       {
-#ifdef HAVE_MPI
-         MPI_Init( &argc, &argv );
-         selectGPU();
-#endif
+         MPI::Init( argc, argv, required_thread_level );
 
          // silence warnings about (potentially) unused variables
          (void) NullGroup;
-         (void) NullRequest;
-      }
-
-      static void setupRedirection()
-      {
-#ifdef HAVE_MPI
-         if(isDistributed() )
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               const std::string stdoutFile = std::string("./stdout_") + std::to_string(GetRank(AllGroup)) + ".txt";
-               const std::string stderrFile = std::string("./stderr_") + std::to_string(GetRank(AllGroup)) + ".txt";
-               std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
-               Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
-            }
-         }
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
       }
 
       static void Finalize()
       {
-#ifdef HAVE_MPI
-         if(isDistributed())
-         {
-            if(GetRank(AllGroup)!=0)
-            {
-               // restore redirection (not necessary, it uses RAII internally...)
-               Debugging::redirect_stdout_stderr( "", "", true );
-            }
-         }
-         MPI_Finalize();
-#endif
+         MPI::Finalize();
       }
 
       static bool IsInitialized()
       {
-#ifdef HAVE_MPI
-         int initialized, finalized;
-         MPI_Initialized(&initialized);
-         MPI_Finalized(&finalized);
-         return initialized && !finalized;
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         return MPI::isInitialized();
       }
 
       static int GetRank(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetRank cannot be called with NullGroup");
-         int rank;
-         MPI_Comm_rank(group,&rank);
-         return rank;
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         return MPI::GetRank( group );
       }
 
       static int GetSize(CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "GetSize cannot be called with NullGroup");
-         int size;
-         MPI_Comm_size(group,&size);
-         return size;
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
-      }
-
-#ifdef HAVE_MPI
-      template< typename T >
-      static MPI_Datatype getDataType( const T& t )
-      {
-         return MPITypeResolver< T >::getType();
-      }
-#endif
-
-      //dim-number of dimensions, distr array of guess distr - 0 for computation
-      //distr array will be filled by computed distribution
-      //more information in MPI documentation
-      static void DimsCreate(int nproc, int dim, int *distr)
-      {
-#ifdef HAVE_MPI
-         int sum = 0, prod = 1;
-         for( int i = 0;i < dim; i++ ) {
-            sum += distr[ i ];
-            prod *= distr[ i ];
-         }
-         if( prod != 0 && prod != GetSize( AllGroup ) )
-            throw Exceptions::MPIDimsCreateError();
-         if(sum==0) {
-            for(int i=0;i<dim-1;i++)
-               distr[i]=1;
-            distr[dim-1]=0;
-         }
-
-         MPI_Dims_create(nproc, dim, distr);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         return MPI::GetSize( group );
       }
 
       static void Barrier( CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Barrier cannot be called with NullGroup");
-         MPI_Barrier(group);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Barrier( group );
       }
 
       template <typename T>
       static void Send( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Send cannot be called with NullGroup");
-         MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Send( data, count, dest, tag, group );
       }
 
       template <typename T>
       static void Recv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
-     }
+         MPI::Recv( data, count, src, tag, group );
+      }
 
       template <typename T>
       static Request ISend( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "ISend cannot be called with NullGroup");
-         Request req;
-         MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req);
-         return req;
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         return MPI::Isend( data, count, dest, tag, group );
       }
 
       template <typename T>
       static Request IRecv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "IRecv cannot be called with NullGroup");
-         Request req;
-         MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req);
-         return req;
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         return MPI::Irecv( data, count, src, tag, group );
       }
 
       static void WaitAll(Request *reqs, int length)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Waitall( reqs, length );
       }
 
       template< typename T >
       static void Bcast( T* data, int count, int root, CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
-         TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
-         MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Bcast( data, count, root, group );
       }
 
       template< typename T >
@@ -349,12 +128,7 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Allreduce( data, reduced_data, count, op, group );
       }
 
       // in-place variant of Allreduce
@@ -364,29 +138,18 @@ class MpiCommunicator
                              const MPI_Op &op,
                              CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
-         MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Allreduce( data, count, op, group );
       }
 
-
       template< typename T >
       static void Reduce( const T* data,
                           T* reduced_data,
                           int count,
-                          MPI_Op &op,
+                          const MPI_Op &op,
                           int root,
                           CommunicationGroup group)
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup");
-         MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Reduce( data, reduced_data, count, op, root, group );
       }
 
       template< typename T >
@@ -400,24 +163,7 @@ class MpiCommunicator
                                int receiveTag,
                                CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Status status;
-         MPI_Sendrecv( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       destination,
-                       sendTag,
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       source,
-                       receiveTag,
-                       group,
-                       &status );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Sendrecv( sendData, sendCount, destination, sendTag, receiveData, receiveCount, source, receiveTag, group );
       }
 
       template< typename T >
@@ -427,94 +173,20 @@ class MpiCommunicator
                             int receiveCount,
                             CommunicationGroup group )
       {
-#ifdef HAVE_MPI
-         TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup");
-         MPI_Alltoall( const_cast< void* >( ( void* ) sendData ),
-                       sendCount,
-                       MPITypeResolver< T >::getType(),
-                       ( void* ) receiveData,
-                       receiveCount,
-                       MPITypeResolver< T >::getType(),
-                       group );
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
-      }
-
-
-      static void writeProlog( Logger& logger )
-      {
-         if( isDistributed() )
-         {
-            logger.writeParameter( "MPI processes:", GetSize(AllGroup) );
-         }
-      }
-
-      static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup )
-      {
-#ifdef HAVE_MPI
-         if(meToo)
-            MPI_Comm_split(oldGroup, 1, myRank, &newGroup);
-         else
-            MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup);
-#else
-         throw Exceptions::MPISupportMissing();
-#endif
+         MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group );
       }
 
 #ifdef HAVE_MPI
-      static MPI_Request NullRequest;
       static MPI_Comm AllGroup;
       static MPI_Comm NullGroup;
 #else
-      static constexpr int NullRequest = -1;
       static constexpr int AllGroup = 1;
       static constexpr int NullGroup = 0;
 #endif
    private:
-
-      static void selectGPU(void)
-      {
-#ifdef HAVE_MPI
-    #ifdef HAVE_CUDA
-         const int count = GetSize(AllGroup);
-         const int rank = GetRank(AllGroup);
-         int gpuCount;
-         cudaGetDeviceCount(&gpuCount);
-
-         procName names[count];
-
-         int i=0;
-         int len;
-         MPI_Get_processor_name(names[rank].name, &len);
-
-         for(i=0;i<count;i++)
-            std::memcpy(names[i].name,names[rank].name,len+1);
-
-         MPI_Alltoall( (void*)names ,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-            (void*)names,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
-                     MPI_COMM_WORLD);
-
-         int nodeRank=0;
-         for(i=0;i<rank;i++)
-         {
-            if(std::strcmp(names[rank].name,names[i].name)==0)
-               nodeRank++;
-         }
-
-         const int gpuNumber = nodeRank % gpuCount;
-
-         cudaSetDevice(gpuNumber);
-         TNL_CHECK_CUDA_DEVICE;
-
-         //std::cout<<"Node: " << rank << " gpu: " << gpuNumber << std::endl;
-    #endif
-#endif
-      }
 };
 
 #ifdef HAVE_MPI
-MPI_Request MpiCommunicator::NullRequest = MPI_REQUEST_NULL;
 MPI_Comm MpiCommunicator::AllGroup = MPI_COMM_WORLD;
 MPI_Comm MpiCommunicator::NullGroup = MPI_COMM_NULL;
 #endif
diff --git a/src/TNL/Communicators/MpiDefs.h b/src/TNL/Communicators/MpiDefs.h
deleted file mode 100644
index 957354b9d0ea911c4269154486af8e95f4a865a9..0000000000000000000000000000000000000000
--- a/src/TNL/Communicators/MpiDefs.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
-                          MpiCommunicator.h  -  description
-                             -------------------
-    begin                : 2005/04/23
-    copyright            : (C) 2005 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#ifndef HAVE_MPI
-enum MPI_Op {
-   MPI_MAX,
-   MPI_MIN,
-   MPI_SUM,
-   MPI_PROD,
-   MPI_LAND,
-   MPI_BAND,
-   MPI_LOR,
-   MPI_BOR,
-   MPI_LXOR,
-   MPI_BXOR,
-   MPI_MINLOC,
-   MPI_MAXLOC,
-};
-#endif
diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h
deleted file mode 100644
index c0d89015be40d4fd72d279e9635b393381a63b36..0000000000000000000000000000000000000000
--- a/src/TNL/Communicators/NoDistrCommunicator.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************
-                          NoDistrCommunicator.h  -  description
-                             -------------------
-    begin                : Jan 9, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Logger.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Communicators/MpiDefs.h>
-
-namespace TNL {
-namespace Communicators {
-
-//! \brief Dummy communicator without any distribution support.
-class NoDistrCommunicator
-{
-   public:
-      using Request = int;
-      using CommunicationGroup = int;
-      static constexpr Request NullRequest = -1;
-      static constexpr CommunicationGroup AllGroup = 1;
-      static constexpr CommunicationGroup NullGroup = 0;
-
-      static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ){};
-
-      static bool setup( const Config::ParameterContainer& parameters,
-                         const String& prefix = "" )
-      {
-         return true;
-      }
-
-      static void Init(int& argc, char**& argv) {}
-
-      static void setupRedirection(){}
-
-      static void Finalize(){}
-
-      static bool IsInitialized()
-      {
-          return true;
-      }
-
-      static bool isDistributed()
-      {
-          return false;
-      }
-
-      static int GetRank(CommunicationGroup group = AllGroup )
-      {
-          return 0;
-      }
-
-      static int GetSize(CommunicationGroup group = AllGroup )
-      {
-          return 1;
-      }
-
-      static void DimsCreate(int nproc, int dim, int *distr)
-      {
-          for(int i=0;i<dim;i++)
-          {
-              distr[i]=1;
-          }
-      }
-
-      static void Barrier(CommunicationGroup group = AllGroup)
-      {
-      }
-
-      template <typename T>
-      static Request ISend( const T *data, int count, int dest, int tag, CommunicationGroup group)
-      {
-          return 1;
-      }
-
-      template <typename T>
-      static Request IRecv( const T *data, int count, int src, int tag, CommunicationGroup group)
-      {
-          return 1;
-      }
-
-      static void WaitAll(Request *reqs, int length)
-      {
-      }
-
-      template< typename T >
-      static void Bcast( T* data, int count, int root, CommunicationGroup group)
-      {
-      }
-
-      template< typename T >
-      static void Allreduce( const T* data,
-                             T* reduced_data,
-                             int count,
-                             const MPI_Op &op,
-                             CommunicationGroup group )
-      {
-         memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
-      }
-
-      // in-place variant of Allreduce
-      template< typename T >
-      static void Allreduce( T* data,
-                             int count,
-                             const MPI_Op &op,
-                             CommunicationGroup group )
-      {
-      }
-
-      template< typename T >
-      static void Reduce( T* data,
-                          T* reduced_data,
-                          int count,
-                          MPI_Op &op,
-                          int root,
-                          CommunicationGroup group )
-      {
-         memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) );
-      }
-
-      template< typename T >
-      static void Alltoall( const T* sendData,
-                            int sendCount,
-                            T* receiveData,
-                            int receiveCount,
-                            CommunicationGroup group )
-      {
-         TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount for NoDistrCommunicator." );
-         memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) );
-      }
-
-      static void CreateNewGroup(bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup)
-      {
-         newGroup=oldGroup;
-      }
-
-      static void writeProlog( Logger& logger )
-      {
-      }
-};
-
-} // namespace Communicators
-} // namespace TNL
diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bfed4d92ce6c7e7ee2384a7644217537fa75887
--- /dev/null
+++ b/src/TNL/Containers/ByteArraySynchronizer.h
@@ -0,0 +1,206 @@
+/***************************************************************************
+                          ByteArraySynchronizer.h  -  description
+                             -------------------
+    begin                : November 17, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <future>
+#include <type_traits>
+#include <vector>
+// 3rd-party async library providing a thread-pool
+#include <async/threadpool.h>
+
+#include <TNL/Containers/ArrayView.h>
+#include <TNL/MPI/Wrappers.h>
+#include <TNL/Timer.h>
+
+namespace TNL {
+namespace Containers {
+
+/**
+ * \brief Abstract base class for synchronizers working on arrays of bytes.
+ *
+ * Derived classes implement \ref synchronizeByteArray and
+ * \ref synchronizeByteArrayAsyncWorker. This class adds
+ * \ref synchronizeByteArrayAsync, which dispatches to them according to the
+ * selected \ref AsyncPolicy and tracks the pending operation in \ref async_op.
+ *
+ * \tparam Device The device type of the synchronized array views.
+ * \tparam Index The index type of the synchronized array views.
+ */
+template< typename Device, typename Index >
+class ByteArraySynchronizer
+{
+private:
+   // NOTE: async::threadpool has alignment requirements, which causes problems:
+   //  - it may become misaligned in derived classes, see e.g.
+   //    https://stackoverflow.com/a/46475498
+   //    solution: specify it as the first member of the base class
+   //  - operator new before C++17 may not support over-aligned types, see
+   //    https://stackoverflow.com/a/53485295
+   //    solution: relaxed alignment requirements to not exceed the value of
+   //    alignof(std::max_align_t), which is the strongest alignment supported
+   //    by plain new. See https://github.com/d36u9/async/pull/2
+   async::threadpool tp;
+
+   // CUDA device current in the calling thread when the last synchronization
+   // was started (CUDA builds only); worker threads select it before running.
+   int gpu_id = 0;
+
+public:
+   //! \brief View of the synchronized data as plain bytes.
+   using ByteArrayView = ArrayView< std::uint8_t, Device, Index >;
+   //! \brief MPI requests returned by \ref synchronizeByteArrayAsyncWorker.
+   using RequestsVector = std::vector< MPI_Request >;
+
+   /**
+    * \brief Selects how \ref synchronizeByteArrayAsync executes the operation.
+    */
+   enum class AsyncPolicy {
+      //! Call \ref synchronizeByteArray directly in the calling thread.
+      synchronous,
+      //! Call \ref synchronizeByteArrayAsyncWorker immediately and wait for
+      //! the returned MPI requests only when \ref async_op is waited on.
+      deferred,
+      //! Run \ref synchronizeByteArray on the internal thread-pool.
+      threadpool,
+      //! Run \ref synchronizeByteArray via `std::async` with `std::launch::async`.
+      async,
+   };
+
+   /**
+    * \brief Default constructor; the internal thread-pool is constructed with
+    * the argument 1 (presumably the number of worker threads).
+    */
+   ByteArraySynchronizer() : tp(1) {}
+
+   /**
+    * \brief Synchronizes the given byte array; implemented by derived classes.
+    *
+    * Called directly for \ref AsyncPolicy::synchronous and from a worker
+    * thread for \ref AsyncPolicy::threadpool and \ref AsyncPolicy::async.
+    *
+    * \param array View of the data to be synchronized, as bytes.
+    * \param bytesPerValue Presumably the size (in bytes) of one value stored
+    *                      in \e array; the exact meaning is defined by the
+    *                      derived class.
+    */
+   virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) = 0;
+
+   /**
+    * \brief Starts the synchronization and returns MPI requests which have to
+    * be waited on to finish it; implemented by derived classes.
+    *
+    * Used for \ref AsyncPolicy::deferred, where the returned requests are
+    * passed to `MPI::Waitall` once \ref async_op is waited on.
+    *
+    * \param array View of the data to be synchronized, as bytes.
+    * \param bytesPerValue Same meaning as in \ref synchronizeByteArray.
+    */
+   virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) = 0;
+
+   /**
+    * \brief An asynchronous version of \ref synchronizeByteArray.
+    *
+    * Note that this method is not thread-safe - only the thread which created
+    * and "owns" the instance of this object can call this method.
+    *
+    * Note that at most one async operation may be active at a time; the
+    * following calls will block until the pending operation is finished.
+    *
+    * \param array View of the data to be synchronized, as bytes.
+    * \param bytesPerValue Forwarded unchanged to the virtual worker methods.
+    * \param policy Selects how the operation is executed, see \ref AsyncPolicy.
+    */
+   void synchronizeByteArrayAsync( ByteArrayView array, int bytesPerValue, AsyncPolicy policy = AsyncPolicy::synchronous )
+   {
+      // wait for any previous synchronization (multiple objects can share the
+      // same synchronizer)
+      if( async_op.valid() ) {
+         async_wait_before_start_timer.start();
+         async_op.wait();
+         async_wait_before_start_timer.stop();
+      }
+
+      async_start_timer.start();
+
+      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
+      #ifdef HAVE_CUDA
+      if( std::is_same< Device, Devices::Cuda >::value )
+         cudaGetDevice(&gpu_id);
+      #endif
+
+      if( policy == AsyncPolicy::threadpool || policy == AsyncPolicy::async ) {
+         // everything offloaded to a separate thread
+         // (captures are listed explicitly because the implicit capture of
+         // `this` via [=] is deprecated since C++20)
+         auto worker = [this, array, bytesPerValue] () {
+            // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
+            #ifdef HAVE_CUDA
+            if( std::is_same< Device, Devices::Cuda >::value )
+               cudaSetDevice(this->gpu_id);
+            #endif
+
+            this->synchronizeByteArray( array, bytesPerValue );
+         };
+
+         if( policy == AsyncPolicy::threadpool )
+            async_op = tp.post( worker );
+         else
+            async_op = std::async( std::launch::async, worker );
+      }
+      else if( policy == AsyncPolicy::deferred ) {
+         // immediate start, deferred synchronization (but still in the same thread)
+         auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
+         auto worker = [requests] () mutable {
+            // MPI::Waitall takes the number of requests as int
+            MPI::Waitall( requests.data(), static_cast< int >( requests.size() ) );
+         };
+         this->async_op = std::async( std::launch::deferred, worker );
+      }
+      else {
+         // synchronous
+         synchronizeByteArray( array, bytesPerValue );
+      }
+
+      async_ops_count++;
+      async_start_timer.stop();
+   }
+
+   virtual ~ByteArraySynchronizer() = default;
+
+   /**
+    * \brief Can be used for checking if a synchronization started
+    * asynchronously has been finished.
+    *
+    * Note that derived classes *must* make this check in the destructor,
+    * otherwise running \ref synchronizeByteArrayAsync would lead to the error
+    * `pure virtual method called` when the derived object is destructed before
+    * the async operation finishes. This cannot be implemented in the base class
+    * destructor, because the derived destructor is run first.
+    *
+    *    ~Derived()
+    *    {
+    *       if( this->async_op.valid() )
+    *          this->async_op.wait();
+    *    }
+    */
+   std::future< void > async_op;
+
+   // attributes for profiling
+   Timer async_wait_before_start_timer, async_start_timer, async_wait_timer;
+   std::size_t async_ops_count = 0;
+};
+
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 66dd8a8f0a07f9ed3a60ddc6fbc0471c17008bb3..3947bfec438a31307b32241a4bebc9e6a4324ab7 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -21,22 +21,22 @@ namespace Containers {
 template< typename Value,
           typename Device = Devices::Host,
           typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class DistributedArray
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using LocalArrayType = Containers::Array< Value, Device, Index >;
+   using LocalArrayType = Containers::Array< Value, Device, Index, Allocator >;
 
 public:
    using ValueType = Value;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
+   using AllocatorType = Allocator;
    using LocalRangeType = Subrange< Index >;
    using LocalViewType = Containers::ArrayView< Value, Device, Index >;
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
-   using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
-   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using ViewType = DistributedArrayView< Value, Device, Index >;
+   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >;
+   using SynchronizerType = typename ViewType::SynchronizerType;
 
    /**
     * \brief A template which allows to quickly obtain a \ref DistributedArray type with changed template parameters.
@@ -44,52 +44,86 @@ public:
    template< typename _Value,
              typename _Device = Device,
              typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedArray< _Value, _Device, _Index, _Communicator >;
+             typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Value > >
+   using Self = DistributedArray< _Value, _Device, _Index, _Allocator >;
 
 
+   ~DistributedArray();
+
+   /**
+    * \brief Constructs an empty array with zero size.
+    */
    DistributedArray() = default;
 
-   DistributedArray( const DistributedArray& ) = default;
+   /**
+    * \brief Constructs an empty array and sets the provided allocator.
+    *
+    * \param allocator The allocator to be associated with this array.
+    */
+   explicit DistributedArray( const AllocatorType& allocator );
+
+   /**
+    * \brief Copy constructor (makes a deep copy).
+    *
+    * \param array The array to be copied.
+    */
+   explicit DistributedArray( const DistributedArray& array );
+
+   /**
+    * \brief Copy constructor with a specific allocator (makes a deep copy).
+    *
+    * \param array The array to be copied.
+    * \param allocator The allocator to be associated with this array.
+    */
+   explicit DistributedArray( const DistributedArray& array, const AllocatorType& allocator );
 
-   DistributedArray( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup(), const AllocatorType& allocator = AllocatorType() );
 
-   void setDistribution( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup );
+   void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup() );
 
    const LocalRangeType& getLocalRange() const;
 
-   CommunicationGroup getCommunicationGroup() const;
+   IndexType getGhosts() const;
+
+   MPI_Comm getCommunicationGroup() const;
+
+   AllocatorType getAllocator() const;
 
    /**
     * \brief Returns a modifiable view of the local part of the array.
-    *
-    * If \e begin or \e end is set to a non-zero value, a view for the
-    * sub-interval `[begin, end)` is returned. Otherwise a view for whole
-    * local part of the array view is returned.
-    *
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
     */
    LocalViewType getLocalView();
 
    /**
     * \brief Returns a non-modifiable view of the local part of the array.
-    *
-    * If \e begin or \e end is set to a non-zero value, a view for the
-    * sub-interval `[begin, end)` is returned. Otherwise a view for whole
-    * local part of the array view is returned.
-    *
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
     */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the array,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the array,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    void copyFromGlobal( ConstLocalViewType globalArray );
 
+   // synchronizer stuff
+   void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 );
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const;
+
+   int getValuesPerElement() const;
+
+   void startSynchronization();
+
+   void waitForSynchronization() const;
+
 
    // Usual Array methods follow below.
 
@@ -168,10 +202,17 @@ public:
    // TODO: serialization (save, load)
 
 protected:
-   LocalRangeType localRange;
-   IndexType globalSize = 0;
-   CommunicationGroup group = Communicator::NullGroup;
+   ViewType view;
    LocalArrayType localData;
+
+private:
+   template< typename Array, std::enable_if_t< std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true >
+   static void setSynchronizerHelper( ViewType& view, const Array& array )
+   {
+      view.setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
+   }
+   template< typename Array, std::enable_if_t< ! std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true >
+   static void setSynchronizerHelper( ViewType& view, const Array& array ) {}
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index c146bbf9f8657e6af5f38a8506d9c944a539c57a..e9ee120932070bfb7cb57e1e65ecd38da1cd01ce 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -15,7 +15,6 @@
 #include "DistributedArray.h"
 
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Communicators/MpiDefs.h>  // important only when MPI is disabled
 
 namespace TNL {
 namespace Containers {
@@ -23,94 +22,226 @@ namespace Containers {
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
-DistributedArray( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group )
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+~DistributedArray()
 {
-   setDistribution( localRange, globalSize, group );
+   // Wait for pending async operation, otherwise the synchronizer would crash
+   // if the array goes out of scope.
+   waitForSynchronization();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( const Allocator& allocator )
+: localData( allocator )
+{
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( const DistributedArray& array )
+{
+   setLike( array );
+   view = array;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( const DistributedArray& array, const Allocator& allocator )
+: localData( allocator )
+{
+   setLike( array );
+   view = array;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
+DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, const Allocator& allocator )
+: localData( allocator )
+{
+   setDistribution( localRange, ghosts, globalSize, group );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
-setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group )
+DistributedArray< Value, Device, Index, Allocator >::
+setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group )
 {
    TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" );
-   this->localRange = localRange;
-   this->globalSize = globalSize;
-   this->group = group;
-   if( group != Communicator::NullGroup )
-      localData.setSize( localRange.getSize() );
+   if( group != MPI::NullGroup() )
+      localData.setSize( localRange.getSize() + ghosts );
+   view.bind( localRange, ghosts, globalSize, group, localData.getView() );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 const Subrange< Index >&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getLocalRange() const
 {
-   return localRange;
+   return view.getLocalRange();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+Index
+DistributedArray< Value, Device, Index, Allocator >::
+getGhosts() const
+{
+   return view.getGhosts();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename Communicator::CommunicationGroup
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+MPI_Comm
+DistributedArray< Value, Device, Index, Allocator >::
 getCommunicationGroup() const
 {
-   return group;
+   return view.getCommunicationGroup();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+Allocator
+DistributedArray< Value, Device, Index, Allocator >::
+getAllocator() const
+{
+   return localData.getAllocator();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getLocalView()
 {
-   return localData.getView();
+   return view.getLocalView();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getConstLocalView() const
 {
-   return localData.getConstView();
+   return view.getConstLocalView();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
+getLocalViewWithGhosts()
+{
+   return view.getLocalViewWithGhosts();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType
+DistributedArray< Value, Device, Index, Allocator >::
+getConstLocalViewWithGhosts() const
+{
+   return view.getConstLocalViewWithGhosts();
 }
 
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 copyFromGlobal( ConstLocalViewType globalArray )
 {
-   TNL_ASSERT_EQ( getSize(), globalArray.getSize(),
-                  "given global array has different size than the distributed array" );
+   view.copyFromGlobal( globalArray );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
+{
+   view.setSynchronizer( synchronizer, valuesPerElement );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType >
+DistributedArray< Value, Device, Index, Allocator >::
+getSynchronizer() const
+{
+   return view.getSynchronizer();
+}
 
-   LocalViewType localView( localData );
-   const LocalRangeType localRange = getLocalRange();
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+int
+DistributedArray< Value, Device, Index, Allocator >::
+getValuesPerElement() const
+{
+   return view.getValuesPerElement();
+}
 
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      localView[ i ] = globalArray[ localRange.getGlobalIndex( i ) ];
-   };
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+startSynchronization()
+{
+   view.startSynchronization();
+}
 
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel );
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+waitForSynchronization() const
+{
+   view.waitForSynchronization();
 }
 
 
@@ -121,30 +252,30 @@ copyFromGlobal( ConstLocalViewType globalArray )
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getView()
 {
-   return ViewType( getLocalRange(), getSize(), getCommunicationGroup(), getLocalView() );
+   return view;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedArray< Value, Device, Index, Allocator >::ConstViewType
+DistributedArray< Value, Device, Index, Allocator >::
 getConstView() const
 {
-   return ConstViewType( getLocalRange(), getSize(), getCommunicationGroup(), getConstLocalView() );
+   return view.getConstView();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 operator ViewType()
 {
    return getView();
@@ -153,8 +284,8 @@ operator ViewType()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >::
 operator ConstViewType() const
 {
    return getConstView();
@@ -163,206 +294,181 @@ operator ConstViewType() const
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setLike( const Array& array )
 {
-   localRange = array.getLocalRange();
-   globalSize = array.getSize();
-   group = array.getCommunicationGroup();
-   localData.setLike( array.getConstLocalView() );
+   localData.setLike( array.getConstLocalViewWithGhosts() );
+   view.bind( array.getLocalRange(), array.getGhosts(), array.getSize(), array.getCommunicationGroup(), localData.getView() );
+   // Set the synchronizer from `array` only when it has one, so an already-set synchronizer is never unset.
+   if( array.getSynchronizer() )
+      setSynchronizerHelper( view, array );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 reset()
 {
-   localRange.reset();
-   globalSize = 0;
-   group = Communicator::NullGroup;
+   view.reset();
    localData.reset();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 empty() const
 {
-   return getSize() == 0;
+   return view.empty();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 Index
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getSize() const
 {
-   return globalSize;
+   return view.getSize();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setValue( ValueType value )
 {
-   localData.setValue( value );
+   view.setValue( value );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 void
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 setElement( IndexType i, ValueType value )
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   localData.setElement( li, value );
+   view.setElement( i, value );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 Value
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 getElement( IndexType i ) const
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData.getElement( li );
+   return view.getElement( i );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 __cuda_callable__
 Value&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator[]( IndexType i )
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData[ li ];
+   return view[ i ];
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 __cuda_callable__
 const Value&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator[]( IndexType i ) const
 {
-   const IndexType li = localRange.getLocalIndex( i );
-   return localData[ li ];
+   return view[ i ];
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedArray< Value, Device, Index, Communicator >&
-DistributedArray< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedArray< Value, Device, Index, Allocator >&
+DistributedArray< Value, Device, Index, Allocator >::
 operator=( const DistributedArray& array )
 {
    setLike( array );
-   localData = array.getConstLocalView();
+   view = array;
    return *this;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array, typename..., typename >
-DistributedArray< Value, Device, Index, Communicator >&
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >&
+DistributedArray< Value, Device, Index, Allocator >::
 operator=( const Array& array )
 {
    setLike( array );
-   localData = array.getConstLocalView();
+   view = array;
    return *this;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator==( const Array& array ) const
 {
-   // we can't run allreduce if the communication groups are different
-   if( group != array.getCommunicationGroup() )
-      return false;
-   const bool localResult =
-         localRange == array.getLocalRange() &&
-         globalSize == array.getSize() &&
-         localData == array.getConstLocalView();
-   bool result = true;
-   if( group != CommunicatorType::NullGroup )
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   return result;
+   return view == array;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Array >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 operator!=( const Array& array ) const
 {
-   return ! (*this == array);
+   return view != array;
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 containsValue( ValueType value ) const
 {
-   bool result = false;
-   if( group != CommunicatorType::NullGroup ) {
-      const bool localResult = localData.containsValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group );
-   }
-   return result;
+   return view.containsValue( value );
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
 bool
-DistributedArray< Value, Device, Index, Communicator >::
+DistributedArray< Value, Device, Index, Allocator >::
 containsOnlyValue( ValueType value ) const
 {
-   bool result = true;
-   if( group != CommunicatorType::NullGroup ) {
-      const bool localResult = localData.containsOnlyValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   }
-   return result;
+   return view.containsOnlyValue( value );
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index e17467befa5850d3f5c1d48723c50526f1ee7a39..cb3235ddbb746acf1149a697beaf49b16b39b1aa 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -12,117 +12,120 @@
 
 #pragma once
 
+#include <memory>
+
 #include <TNL/Containers/ArrayView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
+#include <TNL/Containers/ByteArraySynchronizer.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename Value,
           typename Device = Devices::Host,
-          typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Index = int >
 class DistributedArrayView
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = Value;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
    using LocalRangeType = Subrange< Index >;
    using LocalViewType = Containers::ArrayView< Value, Device, Index >;
    using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >;
-   using ViewType = DistributedArrayView< Value, Device, Index, Communicator >;
-   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >;
+   using ViewType = DistributedArrayView< Value, Device, Index >;
+   using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >;
+   using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref DistributedArrayView type with changed template parameters.
     */
    template< typename _Value,
              typename _Device = Device,
-             typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >;
+             typename _Index = Index >
+   using Self = DistributedArrayView< _Value, _Device, _Index >;
+
 
+   ~DistributedArrayView();
 
    // Initialization by raw data
-   __cuda_callable__
-   DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData )
-   : localRange(localRange), globalSize(globalSize), group(group), localData(localData)
+   DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData )
+   : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData)
    {
-      TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(),
+      TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
                      "The local array size does not match the local range of the distributed array." );
+      TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." );
    }
 
-   __cuda_callable__
    DistributedArrayView() = default;
 
-   // Copy-constructor does shallow copy, so views can be passed-by-value into
-   // CUDA kernels and they can be captured-by-value in __cuda_callable__
-   // lambda functions.
-   __cuda_callable__
+   // Copy-constructor does shallow copy.
    DistributedArrayView( const DistributedArrayView& ) = default;
 
    // "Templated copy-constructor" accepting any cv-qualification of Value
    template< typename Value_ >
-   __cuda_callable__
-   DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& );
+   DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& );
 
    // default move-constructor
-   __cuda_callable__
    DistributedArrayView( DistributedArrayView&& ) = default;
 
-   // method for rebinding (reinitialization)
-   // Note that you can also bind directly to Array and other types implicitly
-   // convertible to ArrayView.
-   __cuda_callable__
+   // method for rebinding (reinitialization) to raw data
+   void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData );
+
+   // Note that you can also bind directly to DistributedArray and other types implicitly
+   // convertible to DistributedArrayView.
    void bind( DistributedArrayView view );
 
    // binding to local array via raw pointer
-   // (local range, global size and communication group are preserved)
+   // (local range, ghosts, global size and communication group are preserved)
    template< typename Value_ >
    void bind( Value_* data, IndexType localSize );
 
-   /**
-    * \brief Returns a modifiable view of the array view.
-    */
-   __cuda_callable__
-   ViewType getView();
+   const LocalRangeType& getLocalRange() const;
 
-   /**
-    * \brief Returns a non-modifiable view of the array view.
-    */
-   __cuda_callable__
-   ConstViewType getConstView() const;
+   IndexType getGhosts() const;
 
+   MPI_Comm getCommunicationGroup() const;
 
-   // Copy-assignment does deep copy, just like regular array, but the sizes
-   // must match (i.e. copy-assignment cannot resize).
-   DistributedArrayView& operator=( const DistributedArrayView& view );
+   LocalViewType getLocalView();
 
-   template< typename Array,
-             typename...,
-             typename = std::enable_if_t< HasSubscriptOperator<Array>::value > >
-   DistributedArrayView& operator=( const Array& array );
+   ConstLocalViewType getConstLocalView() const;
 
+   LocalViewType getLocalViewWithGhosts();
 
-   const LocalRangeType& getLocalRange() const;
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
 
-   CommunicationGroup getCommunicationGroup() const;
+   void copyFromGlobal( ConstLocalViewType globalArray );
 
-   LocalViewType getLocalView();
+   // Synchronizer management (startSynchronization asserts that one is set when ghosts > 0).
+   void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 );
 
-   ConstLocalViewType getConstLocalView() const;
+   std::shared_ptr< SynchronizerType > getSynchronizer() const;
 
-   void copyFromGlobal( ConstLocalViewType globalArray );
+   int getValuesPerElement() const;
+
+   // Note that this method is not thread-safe - only the thread which created
+   // and "owns" the instance of this object can call this method.
+   void startSynchronization();
+
+   void waitForSynchronization() const;
 
 
    /*
     * Usual ArrayView methods follow below.
     */
 
+   /**
+    * \brief Returns a modifiable view of the array view.
+    */
+   ViewType getView();
+
+   /**
+    * \brief Returns a non-modifiable view of the array view.
+    */
+   ConstViewType getConstView() const;
+
    // Resets the array view to the empty state.
    void reset();
 
@@ -151,6 +154,15 @@ public:
    __cuda_callable__
    const ValueType& operator[]( IndexType i ) const;
 
+   // Copy-assignment does deep copy, just like regular array, but the sizes
+   // must match (i.e. copy-assignment cannot resize).
+   DistributedArrayView& operator=( const DistributedArrayView& view );
+
+   template< typename Array,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Array>::value > >
+   DistributedArrayView& operator=( const Array& array );
+
    // Comparison operators
    template< typename Array >
    bool operator==( const Array& array ) const;
@@ -166,9 +178,13 @@ public:
 
 protected:
    LocalRangeType localRange;
+   IndexType ghosts = 0;
    IndexType globalSize = 0;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    LocalViewType localData;
+
+   std::shared_ptr< SynchronizerType > synchronizer = nullptr;
+   int valuesPerElement = 1;
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 0199229d48cab585b78d6618437d9fbcf275092a..65ecc4101fc0258bec0635aa07b202e83c9f178d 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -19,160 +19,161 @@ namespace Containers {
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
+DistributedArrayView< Value, Device, Index >::
+~DistributedArrayView()
+{
+   // Block until a pending asynchronous synchronization (if any) has finished,
+   // otherwise the synchronizer might crash when the view goes out of scope.
+   // (DistributedArray reportedly waits as well, but a view can be bound to an
+   // array that has no synchronizer, in which case the wait here still helps.)
+   waitForSynchronization();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
    template< typename Value_ >
-__cuda_callable__
-DistributedArrayView< Value, Device, Index, Communicator >::
-DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view )
+DistributedArrayView< Value, Device, Index >::
+DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& view )
 : localRange( view.getLocalRange() ),
+  ghosts( view.getGhosts() ),
   globalSize( view.getSize() ),
   group( view.getCommunicationGroup() ),
-  localData( view.getConstLocalView() )
+  localData( view.getConstLocalViewWithGhosts() ),
+  synchronizer( view.getSynchronizer() ),
+  valuesPerElement( view.getValuesPerElement() )
 {}
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-__cuda_callable__
+          typename Index >
+void
+DistributedArrayView< Value, Device, Index >::
+bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData )
+{
+   TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts,
+                  "The local array size does not match the local range of the distributed array." );
+   TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." );
+
+   this->localRange = localRange;
+   this->ghosts = ghosts;
+   this->globalSize = globalSize;
+   this->group = group;
+   this->localData.bind( localData );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 bind( DistributedArrayView view )
 {
    localRange = view.getLocalRange();
+   ghosts = view.getGhosts();
    globalSize = view.getSize();
    group = view.getCommunicationGroup();
-   localData.bind( view.getLocalView() );
+   localData.bind( view.getLocalViewWithGhosts() );
+   // Take over the synchronizer (and valuesPerElement) of `view` only if it has one; never unset ours.
+   if( view.getSynchronizer() )
+      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Value_ >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 bind( Value_* data, IndexType localSize )
 {
-   TNL_ASSERT_EQ( localSize, localRange.getSize(),
+   TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts,
                   "The local array size does not match the local range of the distributed array." );
    localData.bind( data, localSize );
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-__cuda_callable__
-typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getView()
-{
-   return *this;
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Communicator >
-__cuda_callable__
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getConstView() const
+          typename Index >
+const Subrange< Index >&
+DistributedArrayView< Value, Device, Index >::
+getLocalRange() const
 {
-   return *this;
+   return localRange;
 }
 
-
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
-operator=( const DistributedArrayView& view )
+          typename Index >
+Index
+DistributedArrayView< Value, Device, Index >::
+getGhosts() const
 {
-   TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
-   localData = view.getConstLocalView();
-   return *this;
+   return ghosts;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-   template< typename Array, typename..., typename >
-DistributedArrayView< Value, Device, Index, Communicator >&
-DistributedArrayView< Value, Device, Index, Communicator >::
-operator=( const Array& array )
+          typename Index >
+MPI_Comm
+DistributedArrayView< Value, Device, Index >::
+getCommunicationGroup() const
 {
-   TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
-   TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
-   localData = array.getConstLocalView();
-   return *this;
+   return group;
 }
 
-
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-const Subrange< Index >&
-DistributedArrayView< Value, Device, Index, Communicator >::
-getLocalRange() const
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::LocalViewType
+DistributedArrayView< Value, Device, Index >::
+getLocalView()
 {
-   return localRange;
+   return LocalViewType( localData.getData(), localRange.getSize() );  // first localRange.getSize() entries only; the trailing ghosts are excluded
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename Communicator::CommunicationGroup
-DistributedArrayView< Value, Device, Index, Communicator >::
-getCommunicationGroup() const
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType
+DistributedArrayView< Value, Device, Index >::
+getConstLocalView() const
 {
-   return group;
+   return ConstLocalViewType( localData.getData(), localRange.getSize() );  // first localRange.getSize() entries only; the trailing ghosts are excluded
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::LocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getLocalView()
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::LocalViewType
+DistributedArrayView< Value, Device, Index >::
+getLocalViewWithGhosts()
 {
    return localData;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType
-DistributedArrayView< Value, Device, Index, Communicator >::
-getConstLocalView() const
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType
+DistributedArrayView< Value, Device, Index >::
+getConstLocalViewWithGhosts() const
 {
    return localData;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 copyFromGlobal( ConstLocalViewType globalArray )
 {
    TNL_ASSERT_EQ( getSize(), globalArray.getSize(),
                   "given global array has different size than the distributed array view" );
 
-   LocalViewType localView( localData );
+   LocalViewType localView = getLocalView();
    const LocalRangeType localRange = getLocalRange();
 
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
@@ -181,29 +182,114 @@ copyFromGlobal( ConstLocalViewType globalArray )
    };
 
    Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel );
+   startSynchronization();
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+void
+DistributedArrayView< Value, Device, Index >::
+setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement )
+{
+   this->synchronizer = synchronizer;
+   this->valuesPerElement = valuesPerElement;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType >
+DistributedArrayView< Value, Device, Index >::
+getSynchronizer() const
+{
+   return synchronizer;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+int
+DistributedArrayView< Value, Device, Index >::
+getValuesPerElement() const
+{
+   return valuesPerElement;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+void
+DistributedArrayView< Value, Device, Index >::
+startSynchronization()
+{
+   if( ghosts == 0 )
+      return;  // no ghost entries: no synchronization is started
+   // TODO: this assert does not play well with the automatic synchronization
+   //       triggered by operations such as the assignment of scalars.
+   // (Maybe all automatic syncs should be dropped? But that would not be nice
+   // for high-level codes like linear solvers...)
+   TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" );
+
+   typename SynchronizerType::ByteArrayView bytes;
+   bytes.bind( reinterpret_cast<std::uint8_t*>( localData.getData() ), sizeof(ValueType) * localData.getSize() );
+   synchronizer->synchronizeByteArrayAsync( bytes, sizeof(ValueType) * valuesPerElement );
+}
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
+waitForSynchronization() const
+{
+   if( synchronizer && synchronizer->async_op.valid() ) {  // no-op without a synchronizer or without a valid async_op
+      synchronizer->async_wait_timer.start();  // the timer brackets only the blocking wait() below
+      synchronizer->async_op.wait();
+      synchronizer->async_wait_timer.stop();
+   }
+}
+
+
+template< typename Value,
+          typename Device,
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ViewType
+DistributedArrayView< Value, Device, Index >::
+getView()
+{
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+typename DistributedArrayView< Value, Device, Index >::ConstViewType
+DistributedArrayView< Value, Device, Index >::
+getConstView() const
+{
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+void
+DistributedArrayView< Value, Device, Index >::
 reset()
 {
    localRange.reset();
+   ghosts = 0;
    globalSize = 0;
-   group = Communicator::NullGroup;
+   group = MPI::NullGroup();
    localData.reset();
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 empty() const
 {
    return getSize() == 0;
@@ -213,10 +299,9 @@ empty() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 Index
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getSize() const
 {
    return globalSize;
@@ -224,21 +309,20 @@ getSize() const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 setValue( ValueType value )
 {
    localData.setValue( value );
+   startSynchronization();
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 setElement( IndexType i, ValueType value )
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -247,10 +331,9 @@ setElement( IndexType i, ValueType value )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 Value
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 getElement( IndexType i ) const
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -259,11 +342,10 @@ getElement( IndexType i ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 __cuda_callable__
 Value&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator[]( IndexType i )
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -272,11 +354,10 @@ operator[]( IndexType i )
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 __cuda_callable__
 const Value&
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator[]( IndexType i ) const
 {
    const IndexType li = localRange.getLocalIndex( i );
@@ -285,11 +366,47 @@ operator[]( IndexType i ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
+DistributedArrayView< Value, Device, Index >&
+DistributedArrayView< Value, Device, Index >::
+operator=( const DistributedArrayView& view )
+{
+   TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
+   localData = view.getConstLocalViewWithGhosts();
+   // adopt the source's synchronizer if it has one; a source without one never clears ours
+   if( view.getSynchronizer() )
+      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+   template< typename Array, typename..., typename >
+DistributedArrayView< Value, Device, Index >&
+DistributedArrayView< Value, Device, Index >::
+operator=( const Array& array )
+{
+   TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
+   localData = array.getConstLocalViewWithGhosts();
+   // adopt the source's synchronizer if it has one; a source without one never clears ours
+   if( array.getSynchronizer() )
+      setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
+   return *this;
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
    template< typename Array >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator==( const Array& array ) const
 {
    // we can't run allreduce if the communication groups are different
@@ -297,21 +414,22 @@ operator==( const Array& array ) const
       return false;
    const bool localResult =
          localRange == array.getLocalRange() &&
+         ghosts == array.getGhosts() &&
          globalSize == array.getSize() &&
-         localData == array.getConstLocalView();
+         // compare without ghosts
+         getConstLocalView() == array.getConstLocalView();
    bool result = true;
-   if( group != CommunicatorType::NullGroup )
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+   if( group != MPI::NullGroup() )
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
    return result;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Array >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 operator!=( const Array& array ) const
 {
    return ! (*this == array);
@@ -319,32 +437,30 @@ operator!=( const Array& array ) const
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 containsValue( ValueType value ) const
 {
    bool result = false;
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       const bool localResult = localData.containsValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group );
    }
    return result;
 }
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 bool
-DistributedArrayView< Value, Device, Index, Communicator >::
+DistributedArrayView< Value, Device, Index >::
 containsOnlyValue( ValueType value ) const
 {
    bool result = true;
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       const bool localResult = localData.containsOnlyValue( value );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
    }
    return result;
 }
diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h
index 57b94a34b1bd7c210d24462aa1859cc68f087f15..c49e9e31b0250a333bf430e60612214e4d1585d0 100644
--- a/src/TNL/Containers/DistributedNDArray.h
+++ b/src/TNL/Containers/DistributedNDArray.h
@@ -12,34 +12,30 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/NDArray.h>
-#include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename NDArray,
-          typename Communicator = Communicators::MpiCommunicator,
           typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > >
 class DistributedNDArray
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = typename NDArray::ValueType;
    using DeviceType = typename NDArray::DeviceType;
    using IndexType = typename NDArray::IndexType;
+   using AllocatorType = typename NDArray::AllocatorType;
    using SizesHolderType = typename NDArray::SizesHolderType;
    using PermutationType = typename NDArray::PermutationType;
-   using CommunicatorType = Communicator;
    using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >;
    using LocalRangeType = Subrange< IndexType >;
    using OverlapsType = Overlaps;
    using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >;
 
-   using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >;
-   using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >;
+   using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Overlaps >;
+   using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Overlaps >;
    using LocalViewType = typename NDArray::ViewType;
    using ConstLocalViewType = typename NDArray::ConstViewType;
 
@@ -49,10 +45,17 @@ public:
 
    DistributedNDArray() = default;
 
-   // The copy-constructor of TNL::Containers::Array makes shallow copy so our
-   // copy-constructor cannot be default. Actually, we most likely don't need
-   // it anyway, so let's just delete it.
-   DistributedNDArray( const DistributedNDArray& ) = delete;
+   // Constructs an empty array whose local storage uses the given allocator.
+   DistributedNDArray( const AllocatorType& allocator ) : localArray( allocator ) {}
+
+   // Copy constructor (makes a deep copy).
+   explicit DistributedNDArray( const DistributedNDArray& ) = default;
+
+   // Copy constructor with a specific allocator: the local storage is created
+   // with `allocator` and then filled by copy-assignment (a deep copy).
+   explicit DistributedNDArray( const DistributedNDArray& other, const AllocatorType& allocator )
+   : localArray( allocator )
+   { *this = other; }
 
    // Standard copy-semantics with deep copy, just like regular 1D array.
    // Mismatched sizes cause reallocations.
@@ -79,8 +82,13 @@ public:
       return NDArray::getDimension();
    }
 
+   AllocatorType getAllocator() const
+   {
+      return localArray.getAllocator();
+   }
+
    __cuda_callable__
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -232,8 +240,8 @@ public:
             localEnds == other.localEnds &&
             localArray == other.localArray;
       bool result = true;
-      if( group != CommunicatorType::NullGroup )
-         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      if( group != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
       return result;
    }
 
@@ -375,7 +383,7 @@ public:
    }
 
    template< std::size_t level >
-   void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup )
+   void setDistribution( IndexType begin, IndexType end, MPI_Comm group = MPI::AllGroup() )
    {
       static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." );
       TNL_ASSERT_GE( begin, 0, "begin must be non-negative" );
@@ -383,7 +391,7 @@ public:
       TNL_ASSERT_LT( begin, end, "begin must be lesser than end" );
       localBegins.template setSize< level >( begin );
       localEnds.template setSize< level >( end );
-      TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group,
+      TNL_ASSERT( this->group == MPI::NullGroup() || this->group == group,
                   std::cerr << "different groups cannot be combined for different dimensions" );
       this->group = group;
    }
@@ -408,7 +416,7 @@ public:
    void reset()
    {
       localArray.reset();
-      group = CommunicatorType::NullGroup;
+      group = MPI::NullGroup();
       globalSizes = SizesHolderType{};
       localBegins = LocalBeginsType{};
       localEnds = SizesHolderType{};
@@ -435,7 +443,7 @@ public:
 
 protected:
    NDArray localArray;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    SizesHolderType globalSizes;
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h
index bcec4a7b4760d9b864528f401f6ce68c7f3579f2..cea40bc21c3ec0c71ad891aafcfe620c9582754b 100644
--- a/src/TNL/Containers/DistributedNDArraySynchronizer.h
+++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h
@@ -15,6 +15,7 @@
 #include <future>
 
 #include <TNL/Containers/ndarray/SynchronizerBuffers.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -69,7 +70,6 @@ public:
 
 protected:
    using DistributedNDArrayView = typename DistributedNDArray::ViewType;
-   using Communicator = typename DistributedNDArray::CommunicatorType;
    using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >;
 
    DistributedNDArrayView array_view;
@@ -88,12 +88,12 @@ protected:
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true );
 
       // issue all send and receive async operations
-      std::vector< typename Communicator::Request > requests;
-      const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup();
+      std::vector< MPI_Request > requests;
+      const MPI_Comm group = array_view.getCommunicationGroup();
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group );
 
       // wait until send is done
-      Communicator::WaitAll( requests.data(), requests.size() );
+      MPI::Waitall( requests.data(), requests.size() );
 
       // copy data from receive buffers
       Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false );
@@ -152,9 +152,9 @@ protected:
          dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
 
          // FIXME: set proper neighbor IDs !!!
-         const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup();
-         const int rank = Communicator::GetRank(group);
-         const int nproc = Communicator::GetSize(group);
+         const MPI_Comm group = array_view.getCommunicationGroup();
+         const int rank = MPI::GetRank(group);
+         const int nproc = MPI::GetSize(group);
          dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
          dim_buffers.right_neighbor = (rank + 1) % nproc;
       }
@@ -221,32 +221,32 @@ protected:
          auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
          if( LBM_HACK == false ) {
-            requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData(),
-                                                     dim_buffers.left_send_view.getStorageSize(),
-                                                     dim_buffers.left_neighbor, 0, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData(),
-                                                     dim_buffers.left_recv_view.getStorageSize(),
-                                                     dim_buffers.left_neighbor, 1, group ) );
-            requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData(),
-                                                     dim_buffers.right_send_view.getStorageSize(),
-                                                     dim_buffers.right_neighbor, 1, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData(),
-                                                     dim_buffers.right_recv_view.getStorageSize(),
-                                                     dim_buffers.right_neighbor, 0, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
+                                            dim_buffers.left_send_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, 0, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
+                                            dim_buffers.left_recv_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, 1, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
+                                            dim_buffers.right_send_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
+                                            dim_buffers.right_recv_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, 0, group ) );
          }
          else {
-            requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData() + 0,
-                                                     dim_buffers.left_send_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.left_neighbor, 0, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                                     dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.left_neighbor, 1, group ) );
-            requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                                     dim_buffers.right_send_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.right_neighbor, 1, group ) );
-            requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData() + 0,
-                                                     dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
-                                                     dim_buffers.right_neighbor, 0, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
+                                            dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.left_neighbor, 0, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                            dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.left_neighbor, 1, group ) );
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.right_send_view.getStorageSize() / 27 * 18,
+                                            dim_buffers.right_send_view.getStorageSize() / 27 * 9,  // offset and count both come from the send buffer
+                                            dim_buffers.right_neighbor, 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
+                                            dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+                                            dim_buffers.right_neighbor, 0, group ) );
          }
       }
    };
diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h
index 102985e9c15e4ff0d058dc79c04ff14b7ae2194b..4812bf5c006b24dc7ab201901338fcd8ae68337b 100644
--- a/src/TNL/Containers/DistributedNDArrayView.h
+++ b/src/TNL/Containers/DistributedNDArrayView.h
@@ -12,33 +12,30 @@
 
 #pragma once
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/NDArrayView.h>
 #include <TNL/Containers/Subrange.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
 
 template< typename NDArrayView,
-          typename Communicator = Communicators::MpiCommunicator,
           typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > >
 class DistributedNDArrayView
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using ValueType = typename NDArrayView::ValueType;
    using DeviceType = typename NDArrayView::DeviceType;
    using IndexType = typename NDArrayView::IndexType;
    using SizesHolderType = typename NDArrayView::SizesHolderType;
    using PermutationType = typename NDArrayView::PermutationType;
-   using CommunicatorType = Communicator;
    using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >;
    using LocalRangeType = Subrange< IndexType >;
    using OverlapsType = Overlaps;
    using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >;
 
-   using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >;
-   using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >;
+   using ViewType = DistributedNDArrayView< NDArrayView, Overlaps >;
+   using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Overlaps >;
    using LocalViewType = NDArrayView;
    using ConstLocalViewType = typename NDArrayView::ConstViewType;
 
@@ -49,7 +46,7 @@ public:
 
    // explicit initialization by local array view, global sizes and local begins and ends
    __cuda_callable__
-   DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group )
+   DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, MPI_Comm group )
    : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {}
 
    // Copy-constructor does shallow copy, so views can be passed-by-value into
@@ -112,7 +109,7 @@ public:
    void reset()
    {
       localView.reset();
-      group = CommunicatorType::NullGroup;
+      group = MPI::NullGroup();
       globalSizes = SizesHolderType{};
       localBegins = LocalBeginsType{};
       localEnds = SizesHolderType{};
@@ -124,7 +121,7 @@ public:
    }
 
    __cuda_callable__
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -276,8 +273,8 @@ public:
             localEnds == other.localEnds &&
             localView == other.localView;
       bool result = true;
-      if( group != CommunicatorType::NullGroup )
-         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
+      if( group != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
       return result;
    }
 
@@ -406,7 +403,7 @@ public:
 
 protected:
    NDArrayView localView;
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    SizesHolderType globalSizes;
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index 5d5f8303f520ac23171797f2cd240510e42f140c..8d737e3a975b5d4c91451bff93357f59b5864bbe 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -21,21 +21,20 @@ namespace Containers {
 template< typename Real,
           typename Device = Devices::Host,
           typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Real > >
 class DistributedVector
-: public DistributedArray< Real, Device, Index, Communicator >
+: public DistributedArray< Real, Device, Index, Allocator >
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using BaseType = DistributedArray< Real, Device, Index, Communicator >;
+   using BaseType = DistributedArray< Real, Device, Index, Allocator >;
 public:
    using RealType = Real;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
+   using AllocatorType = Allocator;
    using LocalViewType = Containers::VectorView< Real, Device, Index >;
    using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >;
-   using ViewType = DistributedVectorView< Real, Device, Index, Communicator >;
-   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >;
+   using ViewType = DistributedVectorView< Real, Device, Index >;
+   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref Vector type with changed template parameters.
@@ -43,8 +42,8 @@ public:
    template< typename _Real,
              typename _Device = Device,
              typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedVector< _Real, _Device, _Index, _Communicator >;
+             typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Real > >
+   using Self = DistributedVector< _Real, _Device, _Index, _Allocator >;
 
 
    // inherit all constructors and assignment operators from Array
@@ -60,6 +59,11 @@ public:
     */
    explicit DistributedVector( const DistributedVector& ) = default;
 
+   /**
+    * \brief Copy constructor with a specific allocator (makes a deep copy).
+    */
+   explicit DistributedVector( const DistributedVector& vector, const AllocatorType& allocator );
+
    /**
     * \brief Default move constructor.
     */
@@ -75,11 +79,28 @@ public:
     */
    DistributedVector& operator=( DistributedVector&& ) = default;
 
-   // we return only the view so that the user cannot resize it
+   /**
+    * \brief Returns a modifiable view of the local part of the vector.
+    */
    LocalViewType getLocalView();
 
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector.
+    */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    /**
     * \brief Returns a modifiable view of the vector.
     */
@@ -160,8 +181,8 @@ public:
 
 // Enable expression templates for DistributedVector
 namespace Expressions {
-   template< typename Real, typename Device, typename Index, typename Communicator >
-   struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Communicator > >
+   template< typename Real, typename Device, typename Index, typename Allocator >
+   struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Allocator > >
    : std::true_type
    {};
 } // namespace Expressions
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index fa49591e8ae53ffd06214772491c656b91601413..044b747d9f42d148b17b1acb30917b5cdf04887c 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -21,9 +21,19 @@ namespace Containers {
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Real, Device, Index, Allocator >::
+DistributedVector( const DistributedVector& vector, const AllocatorType& allocator )
+: BaseType( vector, allocator )
+{  // intentionally empty: the deep copy is performed by the base-class constructor
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getLocalView()
 {
    return BaseType::getLocalView();
@@ -32,41 +42,63 @@ getLocalView()
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVector< Real, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
 getConstLocalView() const
 {
    return BaseType::getConstLocalView();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
+getLocalViewWithGhosts()
+{
+   return BaseType::getLocalViewWithGhosts();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType
+DistributedVector< Real, Device, Index, Allocator >::
+getConstLocalViewWithGhosts() const
+{
+   return BaseType::getConstLocalViewWithGhosts();
+}
+
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Value, Device, Index, Communicator >::ViewType
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Value, Device, Index, Allocator >::ViewType
+DistributedVector< Value, Device, Index, Allocator >::
 getView()
 {
-   return ViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getLocalView() );
+   return BaseType::getView();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+typename DistributedVector< Value, Device, Index, Allocator >::ConstViewType
+DistributedVector< Value, Device, Index, Allocator >::
 getConstView() const
 {
-   return ConstViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getConstLocalView() );
+   return BaseType::getConstView();
 }
 
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Value, Device, Index, Allocator >::
 operator ViewType()
 {
    return getView();
@@ -75,8 +107,8 @@ operator ViewType()
 template< typename Value,
           typename Device,
           typename Index,
-          typename Communicator >
-DistributedVector< Value, Device, Index, Communicator >::
+          typename Allocator >
+DistributedVector< Value, Device, Index, Allocator >::
 operator ConstViewType() const
 {
    return getConstView();
@@ -90,194 +122,144 @@ operator ConstViewType() const
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator=( const Vector& vector )
 {
    this->setLike( vector );
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = vector.getConstLocalView();
-   }
+   getView() = vector;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator+=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += vector.getConstLocalView();
-   }
+   getView() += vector;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator-=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= vector.getConstLocalView();
-   }
+   getView() -= vector;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator*=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= vector.getConstLocalView();
-   }
+   getView() *= vector;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Vector, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator/=( const Vector& vector )
 {
-   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
-                  "Vector sizes must be equal." );
-   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
-                  "Multiary operations are supported only on vectors which are distributed the same way." );
-   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
-                  "Multiary operations are supported only on vectors within the same communication group." );
-
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= vector.getConstLocalView();
-   }
+   getView() /= vector;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = c;
-   }
+   getView() = c;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator+=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += c;
-   }
+   getView() += c;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator-=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= c;
-   }
+   getView() -= c;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator*=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= c;
-   }
+   getView() *= c;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< typename Scalar, typename..., typename >
-DistributedVector< Real, Device, Index, Communicator >&
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
 operator/=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= c;
-   }
+   getView() /= c;
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index,
-          typename Communicator >
+          typename Allocator >
    template< Algorithms::ScanType Type >
 void
-DistributedVector< Real, Device, Index, Communicator >::
+DistributedVector< Real, Device, Index, Allocator >::
 scan( IndexType begin, IndexType end )
 {
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
+   getView().template scan< Type >( begin, end );
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 157a64b94d64da3ccad3ce81606f0708faa608c2..4a46a47cec4eba56de0e6078ccc4557a52d37177 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -21,32 +21,28 @@ namespace Containers {
 
 template< typename Real,
           typename Device = Devices::Host,
-          typename Index = int,
-          typename Communicator = Communicators::MpiCommunicator >
+          typename Index = int >
 class DistributedVectorView
-: public DistributedArrayView< Real, Device, Index, Communicator >
+: public DistributedArrayView< Real, Device, Index >
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
-   using BaseType = DistributedArrayView< Real, Device, Index, Communicator >;
+   using BaseType = DistributedArrayView< Real, Device, Index >;
    using NonConstReal = typename std::remove_const< Real >::type;
 public:
    using RealType = Real;
    using DeviceType = Device;
-   using CommunicatorType = Communicator;
    using IndexType = Index;
    using LocalViewType = Containers::VectorView< Real, Device, Index >;
    using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >;
-   using ViewType = DistributedVectorView< Real, Device, Index, Communicator >;
-   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >;
+   using ViewType = DistributedVectorView< Real, Device, Index >;
+   using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >;
 
    /**
     * \brief A template which allows to quickly obtain a \ref VectorView type with changed template parameters.
     */
    template< typename _Real,
              typename _Device = Device,
-             typename _Index = Index,
-             typename _Communicator = Communicator >
-   using Self = DistributedVectorView< _Real, _Device, _Index, _Communicator >;
+             typename _Index = Index >
+   using Self = DistributedVectorView< _Real, _Device, _Index >;
 
 
    // inherit all constructors and assignment operators from ArrayView
@@ -58,29 +54,43 @@ public:
    // In C++14, default constructors cannot be inherited, although Clang
    // and GCC since version 7.0 inherit them.
    // https://stackoverflow.com/a/51854172
-   __cuda_callable__
    DistributedVectorView() = default;
 
    // initialization by base class is not a copy constructor so it has to be explicit
    template< typename Real_ >  // template catches both const and non-const qualified Element
-   __cuda_callable__
-   DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view )
+   DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index >& view )
    : BaseType( view ) {}
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector.
+    */
    LocalViewType getLocalView();
 
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector.
+    */
    ConstLocalViewType getConstLocalView() const;
 
+   /**
+    * \brief Returns a modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   LocalViewType getLocalViewWithGhosts();
+
+   /**
+    * \brief Returns a non-modifiable view of the local part of the vector,
+    * including ghost values.
+    */
+   ConstLocalViewType getConstLocalViewWithGhosts() const;
+
    /**
     * \brief Returns a modifiable view of the array view.
     */
-   __cuda_callable__
    ViewType getView();
 
    /**
     * \brief Returns a non-modifiable view of the array view.
     */
-   __cuda_callable__
    ConstViewType getConstView() const;
 
    /*
@@ -142,8 +152,8 @@ public:
 
 // Enable expression templates for DistributedVector
 namespace Expressions {
-   template< typename Real, typename Device, typename Index, typename Communicator >
-   struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index, Communicator > >
+   template< typename Real, typename Device, typename Index >
+   struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index > >
    : std::true_type
    {};
 } // namespace Expressions
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 70f61979fd44fb8d3f9d1878eb2c4a6ecd5c169b..2f9222f94efb579d3a39c803d5685283fee03b33 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -20,10 +20,9 @@ namespace Containers {
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::LocalViewType
+DistributedVectorView< Real, Device, Index >::
 getLocalView()
 {
    return BaseType::getLocalView();
@@ -31,22 +30,39 @@ getLocalView()
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType
-DistributedVectorView< Real, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType
+DistributedVectorView< Real, Device, Index >::
 getConstLocalView() const
 {
    return BaseType::getConstLocalView();
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::LocalViewType
+DistributedVectorView< Real, Device, Index >::
+getLocalViewWithGhosts()
+{
+   return BaseType::getLocalViewWithGhosts();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType
+DistributedVectorView< Real, Device, Index >::
+getConstLocalViewWithGhosts() const
+{
+   return BaseType::getConstLocalViewWithGhosts();
+}
+
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-__cuda_callable__
-typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType
-DistributedVectorView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Value, Device, Index >::ViewType
+DistributedVectorView< Value, Device, Index >::
 getView()
 {
    return *this;
@@ -54,11 +70,9 @@ getView()
 
 template< typename Value,
           typename Device,
-          typename Index,
-          typename Communicator >
-__cuda_callable__
-typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType
-DistributedVectorView< Value, Device, Index, Communicator >::
+          typename Index >
+typename DistributedVectorView< Value, Device, Index >::ConstViewType
+DistributedVectorView< Value, Device, Index >::
 getConstView() const
 {
    return *this;
@@ -71,201 +85,221 @@ getConstView() const
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
                   "The sizes of the array views must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "The local ranges must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "The communication groups of the array views must be equal." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() = vector.getConstLocalView();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() = vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator+=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Multiary operations are supported only on vectors with the same number of ghosts." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() += vector.getConstLocalView();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() += vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator-=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Multiary operations are supported only on vectors with the same number of ghosts." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() -= vector.getConstLocalView();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() -= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator*=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Multiary operations are supported only on vectors with the same number of ghosts." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() *= vector.getConstLocalView();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() *= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Vector, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator/=( const Vector& vector )
 {
    TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
                   "Vector sizes must be equal." );
    TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
                   "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Multiary operations are supported only on vectors with the same number of ghosts." );
    TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
                   "Multiary operations are supported only on vectors within the same communication group." );
 
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      getLocalView() /= vector.getConstLocalView();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() /= vector.getConstLocalViewWithGhosts();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() = c;
+      this->startSynchronization();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator+=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() += c;
+      this->startSynchronization();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator-=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() -= c;
+      this->startSynchronization();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator*=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() *= c;
+      this->startSynchronization();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< typename Scalar, typename..., typename >
-DistributedVectorView< Real, Device, Index, Communicator >&
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
 operator/=( Scalar c )
 {
-   if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
       getLocalView() /= c;
+      this->startSynchronization();
    }
    return *this;
 }
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
    template< Algorithms::ScanType Type >
 void
-DistributedVectorView< Real, Device, Index, Communicator >::
+DistributedVectorView< Real, Device, Index >::
 scan( IndexType begin, IndexType end )
 {
    if( end == 0 )
       end = this->getSize();
    Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
+   this->startSynchronization();
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h
index 4cecc92bb9b3823db92df893c289c0233d26bb14..10bf2d117ab3e740b6bc3aeebd1b1506688851a3 100644
--- a/src/TNL/Containers/Expressions/DistributedComparison.h
+++ b/src/TNL/Containers/Expressions/DistributedComparison.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -38,11 +38,13 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
          return false;
       const bool localResult =
             a.getLocalRange() == b.getLocalRange() &&
+            a.getGhosts() == b.getGhosts() &&
             a.getSize() == b.getSize() &&
+            // compare without ghosts
             a.getConstLocalView() == b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -55,14 +57,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
          return false;
       const bool localResult = a.getConstLocalView() < b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -70,14 +73,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
          return false;
       const bool localResult = a.getConstLocalView() <= b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -85,14 +89,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
          return false;
       const bool localResult = a.getConstLocalView() > b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -100,14 +105,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression
    {
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." );
       TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." );
+      TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." );
 
       // we can't run allreduce if the communication groups are different
       if( a.getCommunicationGroup() != b.getCommunicationGroup() )
          return false;
       const bool localResult = a.getConstLocalView() >= b.getConstLocalView();
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 };
@@ -122,8 +128,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a == b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -136,8 +142,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a < b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -145,8 +151,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a <= b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -154,8 +160,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a > b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 
@@ -163,8 +169,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab
    {
       const bool localResult = a >= b.getConstLocalView();
       bool result = true;
-      if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup )
-         T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
+      if( b.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() );
       return result;
    }
 };
@@ -179,8 +185,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() == b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -193,8 +199,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() < b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -202,8 +208,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() <= b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -211,8 +217,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() > b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 
@@ -220,8 +226,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab
    {
       const bool localResult = a.getConstLocalView() >= b;
       bool result = true;
-      if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup )
-         T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
+      if( a.getCommunicationGroup() != MPI::NullGroup() )
+         MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() );
       return result;
    }
 };
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 1802dcc9553e81c7b31c0e792d6d368a4e743c8d..5f67084fd8f3e21dd84ff165625cc1186386dd9b 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -10,6 +10,7 @@
 
 #pragma once
 #include <utility>
+#include <memory>
 
 #include <TNL/Containers/Expressions/ExpressionTemplates.h>
 #include <TNL/Containers/Expressions/DistributedComparison.h>
@@ -58,12 +59,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType,
                                                         typename T2::ConstLocalViewType,
                                                         Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." );
@@ -79,13 +79,16 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
                      "Attempt to mix operands with different sizes." );
       TNL_ASSERT_EQ( op1.getLocalRange(), op2.getLocalRange(),
                      "Distributed expressions are supported only on vectors which are distributed the same way." );
+      TNL_ASSERT_EQ( op1.getGhosts(), op2.getGhosts(),
+                     "Distributed expressions are supported only on vectors which are distributed the same way." );
       TNL_ASSERT_EQ( op1.getCommunicationGroup(), op2.getCommunicationGroup(),
                      "Distributed expressions are supported only on vectors within the same communication group." );
    }
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -105,7 +108,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getLocalRange();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   IndexType getGhosts() const
+   {
+      return op1.getGhosts();
+   }
+
+   MPI_Comm getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
    }
@@ -115,6 +123,27 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return ConstLocalViewType( op1.getConstLocalView(), op2.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op1.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op1.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op1.waitForSynchronization();  // both operands are distributed here,
+      op2.waitForSynchronization();  // so the wait is forwarded to each of them
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -128,10 +157,9 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." );
@@ -141,7 +169,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -161,7 +190,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return op1.getLocalRange();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   IndexType getGhosts() const
+   {
+      return op1.getGhosts();
+   }
+
+   MPI_Comm getCommunicationGroup() const
    {
       return op1.getCommunicationGroup();
    }
@@ -171,6 +205,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV
       return ConstLocalViewType( op1.getConstLocalView(), op2 );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2 );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op1.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op1.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op1.waitForSynchronization();  // only op1 is waited on; op2 is handed to the local view as-is (presumably a scalar)
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -184,10 +238,9 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
    using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
-   using CommunicatorType = typename T2::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T2::LocalRangeType;
    using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >;
+   using SynchronizerType = typename T2::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T2 >::value,
                   "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the right operand." );
@@ -197,7 +250,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -217,7 +271,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
       return op2.getLocalRange();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   IndexType getGhosts() const
+   {
+      return op2.getGhosts();
+   }
+
+   MPI_Comm getCommunicationGroup() const
    {
       return op2.getCommunicationGroup();
    }
@@ -227,6 +286,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl
       return ConstLocalViewType( op1, op2.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( op1, op2.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return op2.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return op2.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      op2.waitForSynchronization();  // only op2 is waited on; op1 is handed to the local view as-is (presumably a scalar)
+   }
+
 protected:
    const T1& op1;
    const T2& op2;
@@ -241,10 +320,9 @@ struct DistributedUnaryExpressionTemplate
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
-   using CommunicatorType = typename T1::CommunicatorType;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = typename T1::LocalRangeType;
    using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >;
+   using SynchronizerType = typename T1::SynchronizerType;
 
    static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value,
                   "Invalid operand in distributed unary expression templates - distributed expression templates are not enabled for the operand." );
@@ -254,7 +332,8 @@ struct DistributedUnaryExpressionTemplate
 
    RealType getElement( const IndexType i ) const
    {
-      return getConstLocalView().getElement( i );
+      const IndexType li = getLocalRange().getLocalIndex( i );
+      return getConstLocalView().getElement( li );
    }
 
    // this is actually never executed, but needed for proper ExpressionVariableTypeGetter
@@ -274,7 +353,12 @@ struct DistributedUnaryExpressionTemplate
       return operand.getLocalRange();
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   IndexType getGhosts() const
+   {
+      return operand.getGhosts();
+   }
+
+   MPI_Comm getCommunicationGroup() const
    {
       return operand.getCommunicationGroup();
    }
@@ -284,6 +368,26 @@ struct DistributedUnaryExpressionTemplate
       return ConstLocalViewType( operand.getConstLocalView() );
    }
 
+   ConstLocalViewType getConstLocalViewWithGhosts() const
+   {
+      return ConstLocalViewType( operand.getConstLocalViewWithGhosts() );
+   }
+
+   std::shared_ptr< SynchronizerType > getSynchronizer() const
+   {
+      return operand.getSynchronizer();
+   }
+
+   int getValuesPerElement() const
+   {
+      return operand.getValuesPerElement();
+   }
+
+   void waitForSynchronization() const
+   {
+      operand.waitForSynchronization();
+   }
+
 protected:
    const T1& operand;
 };
@@ -812,10 +916,19 @@ template< typename T1,
           typename Operation >
 std::ostream& operator<<( std::ostream& str, const DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression )
 {
+   const auto localRange = expression.getLocalRange();
    str << "[ ";
-   for( int i = 0; i < expression.getSize() - 1; i++ )
+   for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ )
       str << expression.getElement( i ) << ", ";
-   str << expression.getElement( expression.getSize() - 1 ) << " ]";
+   str << expression.getElement( localRange.getEnd() - 1 );
+   if( expression.getGhosts() > 0 ) {
+      str << " | ";
+      const auto localView = expression.getConstLocalViewWithGhosts();
+      for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ )
+         str << localView.getElement( i ) << ", ";
+      str << localView.getElement( localView.getSize() - 1 );
+   }
+   str << " ]";
    return str;
 }
 
@@ -823,10 +936,19 @@ template< typename T,
           typename Operation >
 std::ostream& operator<<( std::ostream& str, const DistributedUnaryExpressionTemplate< T, Operation >& expression )
 {
+   const auto localRange = expression.getLocalRange();
    str << "[ ";
-   for( int i = 0; i < expression.getSize() - 1; i++ )
+   for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ )
       str << expression.getElement( i ) << ", ";
-   str << expression.getElement( expression.getSize() - 1 ) << " ]";
+   str << expression.getElement( localRange.getEnd() - 1 );
+   if( expression.getGhosts() > 0 ) {
+      str << " | ";
+      const auto localView = expression.getConstLocalViewWithGhosts();
+      for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ )
+         str << localView.getElement( i ) << ", ";
+      str << localView.getElement( localView.getSize() - 1 );
+   }
+   str << " ]";
    return str;
 }
 
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index b525e8a5398001998955f2cdcfa40fdcb0891b05..903df1e1dd23ac9e9d0b5193f57760c3d3a9d710 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Expressions/VerticalOperations.h>
-#include <TNL/Communicators/MpiDefs.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Containers {
@@ -21,14 +21,13 @@ template< typename Expression >
 auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -40,26 +39,25 @@ auto DistributedExpressionArgMin( const Expression& expression )
    using RealType = std::decay_t< decltype( expression[0] ) >;
    using IndexType = typename Expression::IndexType;
    using ResultType = std::pair< RealType, IndexType >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< RealType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's real type" );
    ResultType result( -1, std::numeric_limits< RealType >::max() );
    const auto group = expression.getCommunicationGroup();
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       // compute local argMin
       ResultType localResult = ExpressionArgMin( expression.getConstLocalView() );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
       // scatter local result to all processes and gather their results
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       ResultType dataForScatter[ nproc ];
       for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult;
       ResultType gatheredResults[ nproc ];
       // NOTE: exchanging general data types does not work with MPI
-      //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
-      CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
+      //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
+      MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
 
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
@@ -82,14 +80,13 @@ template< typename Expression >
 auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::lowest();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -101,26 +98,25 @@ auto DistributedExpressionArgMax( const Expression& expression )
    using RealType = std::decay_t< decltype( expression[0] ) >;
    using IndexType = typename Expression::IndexType;
    using ResultType = std::pair< RealType, IndexType >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< RealType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's real type" );
    ResultType result( -1, std::numeric_limits< RealType >::lowest() );
    const auto group = expression.getCommunicationGroup();
-   if( group != CommunicatorType::NullGroup ) {
+   if( group != MPI::NullGroup() ) {
       // compute local argMax
       ResultType localResult = ExpressionArgMax( expression.getConstLocalView() );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
       // scatter local result to all processes and gather their results
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       ResultType dataForScatter[ nproc ];
       for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult;
       ResultType gatheredResults[ nproc ];
       // NOTE: exchanging general data types does not work with MPI
-      //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
-      CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
+      //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group );
+      MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group );
 
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
@@ -143,12 +139,11 @@ template< typename Expression >
 auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionSum( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -157,12 +152,11 @@ template< typename Expression >
 auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_t< decltype( expression[0] * expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 1;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionProduct( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -171,14 +165,13 @@ template< typename Expression >
 auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] && expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -187,12 +180,11 @@ template< typename Expression >
 auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] || expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] || expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalOr( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -201,14 +193,13 @@ template< typename Expression >
 auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
    }
    return result;
 }
@@ -217,12 +208,11 @@ template< typename Expression >
 auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] | expression[0] ) >;
-   using CommunicatorType = typename Expression::CommunicatorType;
 
    ResultType result = 0;
-   if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
       const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() );
-      CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
+      MPI::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
    }
    return result;
 }
diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index 7b8a2f31c388f3e1836a86a8579b0a463cb710a7..f8ba157ba6ce1e8fc85c4b9a28526808e8bb2597 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -59,10 +59,8 @@ public:
 
    NDArrayStorage() = default;
 
-   // The copy-constructor of TNL::Containers::Array makes shallow copy so our
-   // copy-constructor cannot be default. Actually, we most likely don't need
-   // it anyway, so let's just delete it.
-   NDArrayStorage( const NDArrayStorage& ) = delete;
+   // Copy constructor (makes a deep copy).
+   explicit NDArrayStorage( const NDArrayStorage& ) = default;
 
    // Standard copy-semantics with deep copy, just like regular 1D array.
    // Mismatched sizes cause reallocations.
@@ -326,21 +324,49 @@ template< typename Value,
           typename SizesHolder,
           typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >,  // identity by default
           typename Device = Devices::Host,
-          typename Index = typename SizesHolder::IndexType >
+          typename Index = typename SizesHolder::IndexType,
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class NDArray
-: public NDArrayStorage< Array< Value, Device, Index >,
+: public NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >
 {
-   using Base = NDArrayStorage< Array< Value, Device, Index >,
+   using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >;
 
 public:
-   // inherit all assignment operators
+   // inherit all constructors and assignment operators
+   using Base::Base;
    using Base::operator=;
+
+   // default constructor
+   NDArray() = default;
+
+   // implement dynamic array interface
+   using AllocatorType = Allocator;
+
+   NDArray( const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator (takes AllocatorType, so the copy constructor is not hijacked)
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+   }
+
+   // Copy constructor with a specific allocator (makes a deep copy).
+   explicit NDArray( const NDArray& other, const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+      // copy the data
+      *this = other;
+   }
+
+   AllocatorType getAllocator() const
+   {
+      return this->array.getAllocator();
+   }
 };
 
 template< typename Value,
@@ -372,21 +398,49 @@ template< typename Value,
           typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >,  // identity by default
           typename SliceInfo = SliceInfo<>,  // no slicing by default
           typename Device = Devices::Host,
-          typename Index = typename SizesHolder::IndexType >
+          typename Index = typename SizesHolder::IndexType,
+          typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > >
 class SlicedNDArray
-: public NDArrayStorage< Array< Value, Device, Index >,
+: public NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::SlicedNDArrayBase< SliceInfo > >
 {
-   using Base = NDArrayStorage< Array< Value, Device, Index >,
+   using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::SlicedNDArrayBase< SliceInfo > >;
 
 public:
-   // inherit all assignment operators
+   // inherit all constructors and assignment operators
+   using Base::Base;
    using Base::operator=;
+
+   // default constructor
+   SlicedNDArray() = default;
+
+   // implement dynamic array interface
+   using AllocatorType = Allocator;
+
+   SlicedNDArray( const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator (takes AllocatorType, so the copy constructor is not hijacked)
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+   }
+
+   // Copy constructor with a specific allocator (makes a deep copy).
+   explicit SlicedNDArray( const SlicedNDArray& other, const AllocatorType& allocator )
+   {
+      // set empty array containing the specified allocator
+      this->array = Array< Value, Device, Index, Allocator >( allocator );
+      // copy the data
+      *this = other;
+   }
+
+   AllocatorType getAllocator() const
+   {
+      return this->array.getAllocator();
+   }
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h
index f0b50747599fad82e9dff3ca21f3cf4944782cf1..6d3605b5a7449faa9664a2e641d6a32cf92e5659 100644
--- a/src/TNL/Containers/Partitioner.h
+++ b/src/TNL/Containers/Partitioner.h
@@ -12,25 +12,27 @@
 
 #pragma once
 
+#include <vector>
+
 #include "Subrange.h"
+#include "ByteArraySynchronizer.h"
 
 #include <TNL/Math.h>
 
 namespace TNL {
 namespace Containers {
 
-template< typename Index, typename Communicator >
+template< typename Index >
 class Partitioner
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using SubrangeType = Subrange< Index >;
 
-   static SubrangeType splitRange( Index globalSize, CommunicationGroup group )
+   static SubrangeType splitRange( Index globalSize, MPI_Comm group )
    {
-      if( group != Communicator::NullGroup ) {
-         const int rank = Communicator::GetRank( group );
-         const int partitions = Communicator::GetSize( group );
+      if( group != MPI::NullGroup() ) {
+         const int rank = MPI::GetRank( group );
+         const int partitions = MPI::GetSize( group );
          const Index begin = TNL::min( globalSize, rank * globalSize / partitions );
          const Index end = TNL::min( globalSize, (rank + 1) * globalSize / partitions );
          return SubrangeType( begin, end );
@@ -66,13 +68,77 @@ public:
       const Index end = min( globalSize, (rank + 1) * globalSize / partitions );
       return end - begin;
    }
-};
 
-// TODO:
-// - partitioner in deal.II stores also ghost indices:
-//   https://www.dealii.org/8.4.0/doxygen/deal.II/classUtilities_1_1MPI_1_1Partitioner.html
-// - ghost indices are stored in a general IndexMap class (based on collection of subranges):
-//   https://www.dealii.org/8.4.0/doxygen/deal.II/classIndexSet.html
+   template< typename Device >
+   class ArraySynchronizer  // exchanges the ghost (overlap) regions of a 1D partitioned array with the left/right neighbor ranks
+   : public ByteArraySynchronizer< Device, Index >
+   {
+      using Base = ByteArraySynchronizer< Device, Index >;
+
+      SubrangeType localRange;  // range owned by this rank (only its size is used below)
+      int overlaps;             // number of values exchanged with each neighbor
+      MPI_Comm group;           // communicator in which the neighbor ranks are determined
+
+   public:
+      using ByteArrayView = typename Base::ByteArrayView;
+      using RequestsVector = typename Base::RequestsVector;
+
+      ~ArraySynchronizer()
+      {
+         // wait for pending async operation, otherwise it would crash
+         if( this->async_op.valid() )
+            this->async_op.wait();
+      }
+
+      ArraySynchronizer() = delete;
+
+      ArraySynchronizer( SubrangeType localRange, int overlaps, MPI_Comm group )
+      : localRange(localRange), overlaps(overlaps), group(group)
+      {}
+
+      virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
+      {
+         auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );  // start the exchange
+         MPI::Waitall( requests.data(), requests.size() );                         // block until all requests complete
+      }
+
+      virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
+      {
+         TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps),
+                        "unexpected array size" );
+
+         const int rank = MPI::GetRank( group );
+         const int nproc = MPI::GetSize( group );
+         const int left = (rank > 0) ? rank - 1 : nproc - 1;   // periodic: rank 0 wraps to the last rank
+         const int right = (rank < nproc - 1) ? rank + 1 : 0;  // periodic: the last rank wraps to rank 0
+
+         // buffer for asynchronous communication requests
+         std::vector< MPI_Request > requests;
+         // Direction tags: 0 = data travelling leftwards, 1 = data travelling rightwards. They keep the two
+         // messages apart when left == right (1 or 2 ranks). Issue all async receive operations:
+         requests.push_back( MPI::Irecv(
+                  array.getData() + bytesPerValue * localRange.getSize(),
+                  bytesPerValue * overlaps,
+                  left, 1, group ) );  // first ghost region <- what the left neighbor sends to its right
+         requests.push_back( MPI::Irecv(
+                  array.getData() + bytesPerValue * (localRange.getSize() + overlaps),
+                  bytesPerValue * overlaps,
+                  right, 0, group ) );  // second ghost region <- what the right neighbor sends to its left
+
+         // issue all async send operations
+         requests.push_back( MPI::Isend(
+                  array.getData(),
+                  bytesPerValue * overlaps,
+                  left, 0, group ) );  // first `overlaps` local values travel leftwards
+         requests.push_back( MPI::Isend(
+                  array.getData() + bytesPerValue * (localRange.getSize() - overlaps),
+                  bytesPerValue * overlaps,
+                  right, 1, group ) );  // last `overlaps` local values travel rightwards
+
+         return requests;
+      }
+   };
+};
 
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Exceptions/MPIDimsCreateError.h b/src/TNL/Exceptions/MPIDimsCreateError.h
deleted file mode 100644
index 1cb1a8f2e61abedb6b7798828918a5f118d9fd89..0000000000000000000000000000000000000000
--- a/src/TNL/Exceptions/MPIDimsCreateError.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
-                          MPIDimsCreateError.h  -  description
-                             -------------------
-    begin                : Jan 30, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <stdexcept>
-
-namespace TNL {
-namespace Exceptions {
-
-struct MPIDimsCreateError
-   : public std::runtime_error
-{
-   MPIDimsCreateError()
-   : std::runtime_error( "The program tries to call MPI_Dims_create with wrong dimensions."
-                         "Non of the dimensions is zero and product of all dimensions does not fit with number of MPI processes." )
-   {}
-};
-
-} // namespace Exceptions
-} // namespace TNL
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 7bfeb4976f46bc98e70454e16b63b94a26bd6dab..3e1ea757b9dd656de17a5fe224695b99e3791e6d 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod_impl.h
  * Author: oberhuber
  *
@@ -25,7 +25,7 @@ String
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 getType()
 {
-   return String( "DirectEikonalProblem< " + 
+   return String( "DirectEikonalProblem< " +
                   Mesh::getType() + ", " +
                   Anisotropy::getType() + ", " +
                   Real::getType() + ", " +
@@ -54,7 +54,7 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 writeProlog( Logger& logger,
              const Config::ParameterContainer& parameters ) const
 {
-   
+
 }
 
 template< typename Mesh,
@@ -123,7 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( dofs );
   String inputFile = parameters.getParameter< String >( "input-file" );
-  this->initialData->setMesh( this->getMesh() ); 
+  this->initialData->setMesh( this->getMesh() );
   if( CommunicatorType::isDistributed() )
   {
     std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
@@ -132,7 +132,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
     if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
       Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *initialData );
     synchronizer.setDistributedGrid( initialData->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *initialData );
+    synchronizer.synchronize( *initialData );
   }
   else
   {
@@ -190,7 +190,7 @@ solve( DofVectorPointer& dofs )
 {
    FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
    fsm.solve( this->getMesh(), u, anisotropy, initialData );
-   
+
    makeSnapshot();
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index a1ca740e4ba7743935ca34797ae21a532a47ac8f..14a52ec40cb5349b741e2880a9474b5ff2b210d9 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod2D_impl.h
  * Author: oberhuber
  *
@@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisot
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-  
+
 }
 
 template< typename Real,
@@ -36,7 +36,7 @@ const Index&
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
-  
+
 }
 
 template< typename Real,
@@ -48,68 +48,68 @@ void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-  
+
 }
 
 template< typename Real,
         typename Device,
         typename Index,
         typename Communicator,
-        typename Anisotropy > 
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
         MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
         const MeshFunctionPointer& u )
-{  
+{
   MeshFunctionPointer auxPtr;
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
-  
+
   // Setting overlaps ( WITHOUT MPI SHOULD BE 0 )
   StaticVector vecLowerOverlaps, vecUpperOverlaps;
   setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
-  
+
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
-  
+
   //auxPtr->save( "aux-ini.tnl" );
-  
+
   typename MeshType::Cell cell( *mesh );
-  
+
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
   synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() );
-  synchronizer.template synchronize< Communicator >( aux ); //synchronize initialized overlaps
-  
-  std::cout << "Calculating the values ..." << std::endl; 
+  synchronizer.synchronize( aux ); //synchronize initialized overlaps
+
+  std::cout << "Calculating the values ..." << std::endl;
   while( iteration < this->maxIterations )
   {
-    // calculatedBefore indicates weather we calculated in the last passage of the while cycle 
-    // calculatedBefore is same for all ranks 
+    // calculatedBefore indicates weather we calculated in the last passage of the while cycle
+    // calculatedBefore is same for all ranks
     // without MPI should be FALSE at the end of while cycle body
     int calculatedBefore = 1;
-    
+
     // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle
     // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
-    int calculateMPIAgain = 1;  
-    
+    int calculateMPIAgain = 1;
+
     while( calculatedBefore )
     {
       calculatedBefore = 0;
-      
+
       if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
         calculateMPIAgain = 0;
-        
+
   /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
         /*
          int numThreadsPerBlock = -1;
-         
+
          numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
          if( numThreadsPerBlock <= 16 )
@@ -127,28 +127,28 @@ solve( const MeshPointer& mesh,
          else
          numThreadsPerBlock = 1024;
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-         
+
          if( numThreadsPerBlock == -1 ){
          printf("Fail in setting numThreadsPerBlock.\n");
          break;
          }
-         
-         
-         
+
+
+
          int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-         
+
          //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-         
+
          //Real **sArray = new Real*[numBlocksX*numBlocksY];
          //for( int i = 0; i < numBlocksX * numBlocksY; i++ )
          // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
-         
+
          ArrayContainer BlockIterHost;
          BlockIterHost.setSize( numBlocksX * numBlocksY );
          BlockIterHost.setValue( 1 );
          int IsCalculationDone = 1;
-         
+
          MeshFunctionPointer helpFunc( mesh );
          MeshFunctionPointer helpFunc1( mesh );
          helpFunc1 = auxPtr;
@@ -164,7 +164,7 @@ solve( const MeshPointer& mesh,
          // std::cout<<std::endl;
          unsigned int numWhile = 0;
          while( IsCalculationDone )
-         {      
+         {
          IsCalculationDone = 0;
          helpFunc1 = auxPtr;
          auxPtr = helpFunc;
@@ -185,9 +185,9 @@ solve( const MeshPointer& mesh,
          default:
          this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
          }
-         
-         
-         //Reduction      
+
+
+         //Reduction
          for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
          IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -196,16 +196,16 @@ solve( const MeshPointer& mesh,
          }
          numWhile++;
          //std::cout <<"numWhile = "<< numWhile <<std::endl;
-         
+
          // for( int j = numBlocksY-1; j>-1; j-- ){
          // for( int i = 0; i < numBlocksX; i++ )
          // std::cout << BlockIterHost[ j * numBlocksX + i ];
          // std::cout << std::endl;
          // }
          // std::cout << std::endl;
-         
+
          this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
-         
+
          //std::cout<<std::endl;
          //String s( "aux-"+ std::to_string(numWhile) + ".tnl");
          //aux.save( s );
@@ -215,8 +215,8 @@ solve( const MeshPointer& mesh,
          }
          */
   /**-END-OF-OMP-PARALLEL------------------------------------------------**/
-        
-        
+
+
   // FSM FOR MPI and WITHOUT MPI
         StaticVector boundsFrom; StaticVector boundsTo;
     // UP and RIGHT
@@ -224,75 +224,75 @@ solve( const MeshPointer& mesh,
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save("aux-1.tnl");
-        
+
     // UP and LEFL
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-2.tnl" );
-        
+
     // DOWN and RIGHT
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-3.tnl" );
-        
+
     // DOWN and LEFT
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
+
       }
       if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA?
       {
         calculateMPIAgain = 0;
-          
+
 #ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
         // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 16 );
-        
+
         // Setting number of threads and blocks for kernel
         int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
         int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
         dim3 blockSize( cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY );
-        
+
         // Need for calling functions from kernel
         BaseType ptr;
-        
+
         // True if we should calculate again.
         int calculateCudaBlocksAgain = 1;
-        
+
         // Array that identifies which blocks should be calculated.
         // All blocks should calculate in first passage ( setValue(1) )
         TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY );
         blockCalculationIndicator.setValue( 1 );
         TNL_CHECK_CUDA_DEVICE;
-        
+
         // Array into which we identify the neighbours and then copy it into blockCalculationIndicator
         TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY );
         blockCalculationIndicatorHelp.setValue( 0 );
-        
+
         // number of Blocks for kernel that calculates neighbours.
         int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        
+
         // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         Containers::Vector< RealType, DeviceType, IndexType > helpVec;
         helpVec.setLike( auxPtr.template getData().getData() );
         MeshFunctionPointer helpFunc;
         helpFunc->bind( mesh, helpVec );
-        helpFunc.template modifyData() = auxPtr.template getData(); 
-        
+        helpFunc.template modifyData() = auxPtr.template getData();
+
         // number of iterations of while calculateCudaBlocksAgain
         int numIter = 0;
-               
+
         //int oddEvenBlock = 0;
         while( calculateCudaBlocksAgain )
         {
   /** HERE IS CHESS METHOD (NO MPI) **/
-          
+
           /*
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
@@ -302,25 +302,25 @@ solve( const MeshPointer& mesh,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           
+
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-           
+
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
            helpFunc.template getData< Device>(),
            auxPtr.template modifyData< Device>(),
-           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, 
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           
+
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-           
+
            calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
           */
   /**------------------------------------------------------------------------------------------------*/
-          
-          
+
+
   /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
@@ -328,10 +328,10 @@ solve( const MeshPointer& mesh,
                   blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           // Switching helpFunc and auxPtr.
           auxPtr.swap( helpFunc );
-          
+
           // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
@@ -340,15 +340,15 @@ solve( const MeshPointer& mesh,
           blockCalculationIndicator = blockCalculationIndicatorHelp;
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
-          
+
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
          if( calculateCudaBlocksAgain ){
             calculatedBefore = 1;
           }
-          
+
 /**-----------------------------------------------------------------------------------------------------------*/
           numIter ++;
         }
@@ -364,13 +364,13 @@ solve( const MeshPointer& mesh,
 #endif
       }
 
-      
-/**----------------------MPI-TO-DO---------------------------------------------**/        
+
+/**----------------------MPI-TO-DO---------------------------------------------**/
 #ifdef HAVE_MPI
       if( CommunicatorType::isDistributed() ){
         getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
-       
-        synchronizer.template synchronize< Communicator >( aux );
+
+        synchronizer.synchronize( aux );
       }
 #endif
       if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
@@ -384,9 +384,9 @@ solve( const MeshPointer& mesh,
 
 // PROTECTED FUNCTIONS:
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
               const MeshPointer& mesh)
@@ -406,11 +406,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 
 
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-bool 
+bool
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
-goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
         MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
         const AnisotropyPointer& anisotropy )
 {
@@ -418,10 +418,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   const MeshType& mesh = aux.getMesh();
   const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
   const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
-  
+
   typename MeshType::Cell cell( mesh );
   cell.refresh();
-  
+
   for( cell.getCoordinates().y() = boundsFrom[1];
           TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
           cell.getCoordinates().y() += stepY )
@@ -444,54 +444,54 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
 
 
 #ifdef HAVE_MPI
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
-  
+
   int calculateFromNeighbours[4] = {0,0,0,0};
   const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
   MPI::Request *requestsInformation;
-  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
-  
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];
+
   int neighCount = 0; // should this thread calculate again?
-  
+
   if( neighbours[0] != -1 ) // LEFT
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[1] != -1 ) // RIGHT
   {
     requestsInformation[neighCount++] =
-            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
-    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[2] != -1 ) //UP
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
     requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup  );
   }
-  
+
   if( neighbours[5] != -1 ) //DOWN
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
   }
   MPI::WaitAll( requestsInformation, neighCount );
-  
+
   MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
   calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
               calculateFromNeighbours[2] || calculateFromNeighbours[3];
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index add4d9610b1fff9bd08d8901a3a708f94d66f1b1..9468ff1db32fe86e2546052dfa77f9dc268d1182 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -4,7 +4,7 @@
  * and open the template in the editor.
  */
 
-/* 
+/*
  * File:   tnlFastSweepingMethod2D_impl.h
  * Author: oberhuber
  *
@@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisot
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-  
+
 }
 
 template< typename Real,
@@ -36,7 +36,7 @@ const Index&
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
-  
+
 }
 
 template< typename Real,
@@ -48,7 +48,7 @@ void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-  
+
 }
 
 template< typename Real,
@@ -67,46 +67,46 @@ solve( const MeshPointer& mesh,
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
-  
+
   // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
   Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps;
   setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
-  
+
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
-  auxPtr->save( "aux-ini.tnl" );   
-  
+  auxPtr->save( "aux-ini.tnl" );
+
   typename MeshType::Cell cell( *mesh );
-  
+
   IndexType iteration( 0 );
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
   synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() );
-  synchronizer.template synchronize< Communicator >( aux ); //synchronization of intial conditions
-  
+  synchronizer.synchronize( aux ); //synchronization of initial conditions
+
   while( iteration < this->maxIterations )
   {
-    // indicates weather we calculated in the last passage of the while cycle 
-    // calculatedBefore is same for all ranks 
+    // indicates whether we calculated in the last passage of the while cycle
+    // calculatedBefore is same for all ranks
     // without MPI should be FALSE at the end of while cycle body
-    int calculatedBefore = 1; 
-    
+    int calculatedBefore = 1;
+
     // indicates if the MPI process should calculate again in upcoming passage of cycle
     // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
-    int calculateMPIAgain = 1; 
-    
+    int calculateMPIAgain = 1;
+
     while( calculatedBefore )
     {
       calculatedBefore = 0;
-      
+
       if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
         calculateMPIAgain = 0;
-        
+
 /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */
         /*int numThreadsPerBlock = -1;
-         
+
          numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
          if( numThreadsPerBlock <= 16 )
@@ -124,26 +124,26 @@ solve( const MeshPointer& mesh,
          else
          numThreadsPerBlock = 1024;
          //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-         
+
          if( numThreadsPerBlock == -1 ){
             printf("Fail in setting numThreadsPerBlock.\n");
          break;
          }
-         
+
          int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
          //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-         
+
          //Real **sArray = new Real*[numBlocksX*numBlocksY];
          // for( int i = 0; i < numBlocksX * numBlocksY; i++ )
          // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
-         
+
          ArrayContainer BlockIterHost;
          BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
          BlockIterHost.setValue( 1 );
          int IsCalculationDone = 1;
-         
+
          MeshFunctionPointer helpFunc( mesh );
          MeshFunctionPointer helpFunc1( mesh );
          helpFunc1 = auxPtr;
@@ -159,7 +159,7 @@ solve( const MeshPointer& mesh,
          // std::cout<<std::endl;
          unsigned int numWhile = 0;
          while( IsCalculationDone  )
-         {      
+         {
          IsCalculationDone = 0;
          helpFunc1 = auxPtr;
          auxPtr = helpFunc;
@@ -180,7 +180,7 @@ solve( const MeshPointer& mesh,
          default:
          this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
          }
-         //Reduction      
+         //Reduction
          for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
          IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -188,10 +188,10 @@ solve( const MeshPointer& mesh,
          }
          }
          numWhile++;
-         
-         
+
+
          this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
-         
+
          //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
          //aux.save( s );
          }
@@ -200,60 +200,60 @@ solve( const MeshPointer& mesh,
          }
          aux = *auxPtr;*/
 /**------------------------------------------------------------------------------*/
-        
-        
+
+
 /** HERE IS FSM WITH MPI AND WITHOUT MPI */
         StaticVector boundsFrom; StaticVector boundsTo;
-        
-    // TOP, NORTH and EAST        
+
+    // TOP, NORTH and EAST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, NORTH and WEST        
+
+    // TOP, NORTH and WEST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, SOUTH and EAST        
+
+    // TOP, SOUTH and EAST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // TOP, SOUTH and WEST        
+
+    // TOP, SOUTH and WEST
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-            
-    // BOTTOM, NOTH and EAST        
+
+    // BOTTOM, NORTH and EAST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); 
-        
-    // BOTTOM, NOTH and WEST        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+    // BOTTOM, NORTH and WEST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
-        
-    // BOTTOM, SOUTH and EAST        
+
+    // BOTTOM, SOUTH and EAST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
-        
-    // BOTTOM, SOUTH and WEST        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+    // BOTTOM, SOUTH and WEST
         boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
-        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
-        
-        
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+
+
   /**----------------------------------------------------------------------------------*/
       }
       if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain )
@@ -263,50 +263,50 @@ solve( const MeshPointer& mesh,
         // the number should be less than 10^3 (num of threads in one grid is maximally 1024)
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 8 );
-        
+
         // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps)
         int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
         int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
-        int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize );
         if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
           std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-        
+
         // Making the variables for global function CudaUpdateCellCaller.
         dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-        
+
         BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller
-        
-        
+
+
         int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again
-        
+
         // Array containing information about each block in grid, answering question (Have we calculated in this block?)
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
         BlockIterDevice.setValue( 1 ); // calculate all in the first passage
-        
+
         // Helping Array for GetNeighbours3D
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
         BlockIterPom.setValue( 0 ); //doesnt matter what number
-        
-        
-        
+
+
+
         // number of neighbours in one block (1024 threads) for GetNeighbours3D
         int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
-        
-        
-        //MeshFunctionPointer helpFunc1( mesh );      
+
+
+        //MeshFunctionPointer helpFunc1( mesh );
         Containers::Vector< RealType, DeviceType, IndexType > helpVec;
         helpVec.setLike( auxPtr.template getData().getData() );
         MeshFunctionPointer helpFunc;
         helpFunc->bind( mesh, helpVec );
         helpFunc.template modifyData() = auxPtr.template getData();
         Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-                
+
         int numIter = 0; // number of passages of following while cycle
-        
+
         while( BlockIterD ) //main body of cuda code
         {
-          
+
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           // main function that calculates all values in each blocks
           // calculated values are in helpFunc
@@ -319,7 +319,7 @@ solve( const MeshPointer& mesh,
           TNL_CHECK_CUDA_DEVICE;
           // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
           auxPtr.swap( helpFunc );
-          
+
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
           // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
@@ -328,23 +328,23 @@ solve( const MeshPointer& mesh,
           TNL_CHECK_CUDA_DEVICE;
           BlockIterDevice = BlockIterPom;
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-          
+
           // .containsValue(1) is actually parallel reduction implemented in TNL
           BlockIterD = BlockIterDevice.containsValue(1);
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
+
           numIter++;
-          if( BlockIterD ){ 
+          if( BlockIterD ){
             // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
             calculatedBefore = 1;
           }
         }
         if( numIter%2 == 1 ){
-          
+
           // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
           // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
-          auxPtr.swap( helpFunc ); 
+          auxPtr.swap( helpFunc );
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         }
         cudaDeviceSynchronize();
@@ -353,35 +353,35 @@ solve( const MeshPointer& mesh,
         interfaceMap = *interfaceMapPtr;
 #endif
       }
-      
+
 #ifdef HAVE_MPI
       if( CommunicatorType::isDistributed() )
       {
         getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
 
-        // synchronizate the overlaps 
-        synchronizer.template synchronize< Communicator >( aux );
+        // synchronize the overlaps
+        synchronizer.synchronize( aux );
 
       }
 #endif
-      
+
       if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0!
         calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly
     }
     //aux.save( "aux-8.tnl" );
     iteration++;
-    
+
   }
   // Saving the results into Aux for MakeSnapshot function.
-  Aux = auxPtr; 
+  Aux = auxPtr;
   aux.save("aux-final.tnl");
 }
 
 // PROTECTED FUNCTIONS:
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
               const MeshPointer& mesh)
@@ -402,11 +402,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 
 
 
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-bool 
+bool
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
-goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
         MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
         const AnisotropyPointer& anisotropy )
 {
@@ -415,10 +415,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
   const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
   const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1;
-  
+
   typename MeshType::Cell cell( mesh );
   cell.refresh();
-  
+
   for( cell.getCoordinates().z() = boundsFrom[2];
           TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0;
           cell.getCoordinates().z() += stepZ )
@@ -446,72 +446,72 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
 
 
 #ifdef HAVE_MPI
-template< typename Real, typename Device, typename Index, 
+template< typename Real, typename Device, typename Index,
           typename Communicator, typename Anisotropy >
-void 
+void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
-  
+
   int calculateFromNeighbours[6] = {0,0,0,0,0,0};
-        
+
   const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
   MPI::Request *requestsInformation;
-  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
-  
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];
+
   int neighCount = 0; // should this thread calculate again?
-  
+
   if( neighbours[0] != -1 ) // WEST
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[1] != -1 ) // EAST
   {
     requestsInformation[neighCount++] =
-            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
-    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[2] != -1 ) //NORTH
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
     requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[5] != -1 ) //SOUTH
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
   }
-  
-  if( neighbours[8] != -1 ) // TOP 
+
+  if( neighbours[8] != -1 ) // TOP
   {
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup );
   }
-  
+
   if( neighbours[17] != -1 ) //BOTTOM
   {
     requestsInformation[neighCount++] =
             MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup );
-    requestsInformation[neighCount++] = 
+    requestsInformation[neighCount++] =
             MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup );
   }
-  
+
   MPI::WaitAll( requestsInformation, neighCount );
-  
+
   MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
   calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
                       calculateFromNeighbours[2] || calculateFromNeighbours[3] ||
diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h
index 3cc0af53ae824a9fa8f79bf6f505dbeaa26465cf..b9ec101cf60d2bb3d266cb92d07bfbccac1eb28f 100644
--- a/src/TNL/Functions/CutMeshFunction.h
+++ b/src/TNL/Functions/CutMeshFunction.h
@@ -14,9 +14,8 @@
 #include <TNL/Containers/StaticVector.h>
 
 namespace TNL {
-namespace Functions {  
-template <  typename CommunicatorType,
-            typename MeshFunctionType,
+namespace Functions {
+template <  typename MeshFunctionType,
             typename OutMesh,
             typename OutDof,
             int outDimension=OutMesh::getMeshDimension(),
@@ -25,10 +24,10 @@ class CutMeshFunction
 {
   public:
     static bool Cut(MeshFunctionType &inputMeshFunction,
-                    OutMesh &outMesh, 
+                    OutMesh &outMesh,
                     OutDof &outData,
-                    Containers::StaticVector<outDimension, int> savedDimensions, 
-                    Containers::StaticVector<codimension,int> reducedDimensions, 
+                    Containers::StaticVector<outDimension, int> savedDimensions,
+                    Containers::StaticVector<codimension,int> reducedDimensions,
                     Containers::StaticVector<codimension,typename MeshFunctionType::IndexType> fixedIndexs )
     {
         bool inCut;
@@ -44,7 +43,7 @@ class CutMeshFunction
             auto toDistributedGrid=outMesh.getDistributedMesh();
             TNL_ASSERT_TRUE(toDistributedGrid!=nullptr,"You are trying cut distributed meshfunction, but output grid is not set up for distribution");
 
-            inCut=toDistributedGrid-> template SetupByCut<CommunicatorType>(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs);
+            inCut=toDistributedGrid->SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs);
             if(inCut)
             {
                toDistributedGrid->setupGrid(outMesh);
@@ -56,7 +55,7 @@ class CutMeshFunction
         {
             typename OutMesh::PointType outOrigin;
             typename OutMesh::PointType outProportions;
-            typename OutMesh::CoordinatesType outDimensions; 
+            typename OutMesh::CoordinatesType outDimensions;
 
             for(int i=0; i<outDimension;i++)
             {
@@ -64,13 +63,13 @@ class CutMeshFunction
                 outProportions[i]=fromMesh.getProportions()[savedDimensions[i]];
                 outDimensions[i]=fromMesh.getDimensions()[savedDimensions[i]];
             }
-            
+
             outMesh.setDimensions(outDimensions);
             outMesh.setDomain(outOrigin,outProportions);
-            
+
             inCut=true;
             localFixedIndexs=fixedIndexs;
-            
+
         }
 
         //copy data
@@ -104,7 +103,7 @@ class CutMeshFunction
         }
 
         return inCut;
-    } 
+    }
 };
 
 } // namespace Functions
diff --git a/src/TNL/MPI.h b/src/TNL/MPI.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5f9145b5fe727ed2e9729e59d08543840d610e5
--- /dev/null
+++ b/src/TNL/MPI.h
@@ -0,0 +1,31 @@
+/***************************************************************************
+                          MPI.h  -  description
+                             -------------------
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+/**
+ * \brief A convenient header file which includes all headers from the
+ * `TNL/MPI/` subdirectory.
+ *
+ * Users may use this to avoid having to include many header files in their
+ * projects. On the other hand, parts of the TNL library should generally
+ * include only the specific headers they need, in order to avoid cycles in
+ * the header inclusion.
+ */
+
+#include "MPI/DummyDefs.h"
+#include "MPI/getDataType.h"
+#include "MPI/Profiling.h"
+#include "MPI/selectGPU.h"
+#include "MPI/Wrappers.h"
+#include "MPI/Utils.h"
+#include "MPI/ScopedInitializer.h"
+#include "MPI/Config.h"
+#include "MPI/Print.h"
diff --git a/src/TNL/MPI/Config.h b/src/TNL/MPI/Config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d560b1d5589d54f7dab5652dc7fd15ba5f95d941
--- /dev/null
+++ b/src/TNL/MPI/Config.h
@@ -0,0 +1,138 @@
+/***************************************************************************
+                          MPI/Config.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+
+#ifdef HAVE_MPI
+#ifdef OMPI_MAJOR_VERSION
+   // header specific to OpenMPI (needed for CUDA-aware detection)
+   #include <mpi-ext.h>
+#endif
+
+#include <unistd.h>  // getpid
+#endif
+
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+#include "Utils.h"
+
+namespace TNL {
+namespace MPI {
+
+/**
+ * \brief Registers the MPI-related entries in a configuration description.
+ *
+ * The entries `redirect-mpi-output`, `redirect-mpi-output-dir`,
+ * `mpi-gdb-debug` and `mpi-process-to-attach` are added only when
+ * `HAVE_MPI` is defined; otherwise the function does nothing.
+ *
+ * \param config Configuration description that receives the entries.
+ * \param prefix Currently unused; entry names are registered without it.
+ */
+inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" )
+{
+#ifdef HAVE_MPI
+   config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true );
+   config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." );
+   config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false );
+   config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 );
+#endif
+}
+
+/**
+ * \brief Applies the MPI-related parameters registered by configSetup().
+ *
+ * Nothing happens unless `HAVE_MPI` is defined and MPI is initialized and
+ * not yet finalized. In that case the function optionally redirects the
+ * output, reports in CUDA builds running on more than one process whether
+ * the MPI library is CUDA-aware, and optionally waits until GDB is attached.
+ *
+ * \param parameters Container holding the values of the entries above.
+ * \param prefix Currently unused; parameters are looked up without it.
+ * \return `false` if the MPI library reports that it is not CUDA-aware in a
+ *         CUDA build running on more than one process, `true` otherwise.
+ */
+inline bool setup( const Config::ParameterContainer& parameters,
+                   const String& prefix = "" )
+{
+#ifdef HAVE_MPI
+   if( Initialized() && ! Finalized() )
+   {
+      const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" );
+      const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" );
+      if( redirect )
+         MPI::setupRedirection( outputDirectory );
+#ifdef HAVE_CUDA
+      if( GetSize() > 1 )
+      {
+#if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT
+         std::cout << "CUDA-aware MPI detected on this system ... " << std::endl;
+#elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT
+         std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl;
+         return false;
+#else
+         std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl;
+#endif
+      }
+#endif // HAVE_CUDA
+      bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" );
+      int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" );
+
+      if( gdbDebug )
+      {
+         int rank = GetRank( MPI_COMM_WORLD );
+         int pid = getpid();
+
+         // GDB releases the busy-wait below by setting this flag to 1;
+         // volatile forces the loop to re-read it on every iteration.
+         volatile int tnlMPIDebugAttached = 0;
+
+         // Every rank, including rank 0 itself, reports its PID to rank 0. The
+         // send must be non-blocking: the matching receives are posted only
+         // after the barrier, so a blocking MPI_Send would complete only if the
+         // library buffered the message and could otherwise deadlock.
+         MPI_Request pidRequest;
+         MPI_Isend( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &pidRequest );
+         MPI_Barrier( MPI_COMM_WORLD );
+         if( rank == 0 )
+         {
+            std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl;
+            for( int i = 0; i < GetSize(); i++ )
+            {
+               MPI_Status status;
+               int recvPid;
+               MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status );
+
+               if( i == processToAttach || processToAttach == -1 )
+               {
+                  std::cout << "  For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\""
+                            << " -ex \"set variable tnlMPIDebugAttached=1\""
+                            << " -ex \"continue\"" << std::endl;
+               }
+            }
+            std::cout << std::flush;
+         }
+         // Complete the send before spinning: a rank stuck in the busy-wait
+         // makes no MPI calls that could progress a still-pending transfer.
+         MPI_Wait( &pidRequest, MPI_STATUS_IGNORE );
+         // Spin until a debugger attached to this process flips the flag.
+         if( rank == processToAttach || processToAttach == -1 )
+            while( ! tnlMPIDebugAttached );
+         MPI_Barrier( MPI_COMM_WORLD );
+      }
+   }
+#endif // HAVE_MPI
+   return true;
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/DummyDefs.h b/src/TNL/MPI/DummyDefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..578e46dfef428084937b7be0d034a0cd5bc4a840
--- /dev/null
+++ b/src/TNL/MPI/DummyDefs.h
@@ -0,0 +1,51 @@
+/***************************************************************************
+                          MPI/DummyDefs.h  -  description
+                             -------------------
+    begin                : Dec 29, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifndef HAVE_MPI  // stand-ins for MPI names, compiled only when HAVE_MPI is not defined
+using MPI_Request = int;  // placeholder type; no real MPI request handle is behind it
+using MPI_Comm = int;  // placeholder type; no real MPI communicator is behind it
+
+enum MPI_Op {  // names of the MPI reduction operations; values are the implicit 0, 1, 2, ...
+   MPI_MAX,
+   MPI_MIN,
+   MPI_SUM,
+   MPI_PROD,
+   MPI_LAND,
+   MPI_BAND,
+   MPI_LOR,
+   MPI_BOR,
+   MPI_LXOR,
+   MPI_BXOR,
+   MPI_MINLOC,
+   MPI_MAXLOC,
+};
+
+// MPI_Init_thread constants
+enum {  // unnamed enum: the names convert implicitly to int
+   MPI_THREAD_SINGLE,
+   MPI_THREAD_FUNNELED,
+   MPI_THREAD_SERIALIZED,
+   MPI_THREAD_MULTIPLE
+};
+
+// Miscellaneous constants
+#define MPI_ANY_SOURCE         -1                      /* match any source rank */
+#define MPI_PROC_NULL          -2                      /* rank of null process */
+#define MPI_ROOT               -4                      /* special value for intercomms */
+#define MPI_ANY_TAG            -1                      /* match any message tag */
+#define MPI_UNDEFINED          -32766                  /* undefined stuff */
+#define MPI_DIST_GRAPH         3                       /* dist graph topology */
+#define MPI_CART               1                       /* cartesian topology */
+#define MPI_GRAPH              2                       /* graph topology */
+#define MPI_KEYVAL_INVALID     -1                      /* invalid key value */
+
+#endif  // !HAVE_MPI
diff --git a/src/TNL/Communicators/MPIPrint.h b/src/TNL/MPI/Print.h
similarity index 75%
rename from src/TNL/Communicators/MPIPrint.h
rename to src/TNL/MPI/Print.h
index 6d78eafaf8c67c1c770faf01fa879d4b31b4032a..5cd4819a2951cf46093eecb6ab5052a7b278e155 100644
--- a/src/TNL/Communicators/MPIPrint.h
+++ b/src/TNL/MPI/Print.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          MPIPrint.h  -  description
+                          MPI/Print.h  -  description
                              -------------------
     begin                : Feb 7, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -10,34 +10,35 @@
 
 #pragma once
 
+#include <iostream>
 #include <sstream>
-#include <TNL/Communicators/MpiCommunicator.h>
+
+#include <TNL/String.h>
+#include <TNL/MPI/Wrappers.h>
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT( message )                                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       std::stringstream __tnl_mpi_print_stream_;                                                                                 \
-      __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                             \
-         << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                     \
+      __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                        \
+                              << message << std::endl;                                                                           \
       TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                      \
       mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                                  \
    }                                                                                                                             \
    else                                                                                                                          \
    {                                                                                                                             \
-      std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;              \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
-         {                                                                                                                       \
-            TNL::String __tnl_mpi_print_string_;                                                                                 \
-            mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                         \
-            std::cerr << __tnl_mpi_print_string_;                                                                                \
-         }                                                                                                                       \
+      std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                         \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
+      {                                                                                                                          \
+         TNL::String __tnl_mpi_print_string_;                                                                                    \
+         mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() );                            \
+         std::cerr << __tnl_mpi_print_string_;                                                                                   \
+      }                                                                                                                          \
    }                                                                                                                             \
 }
 #else
@@ -47,11 +48,11 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_MASTER( message )                                                                                          \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
    std::cerr << message << std::endl;                                                                                            \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() == 0 )                                                                     \
+   if( TNL::MPI::GetRank() == 0 )                                                                     \
    {                                                                                                                             \
       std::cerr << "Master node : " << message << std::endl;                                                                     \
    }                                                                                                                             \
@@ -63,20 +64,20 @@ else
 
 #ifdef HAVE_MPI
 #define TNL_MPI_PRINT_COND( condition, message )                                                                                 \
-if( ! TNL::Communicators::MpiCommunicator::IsInitialized() )                                                                     \
+if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() )                                                                         \
 {                                                                                                                                \
    if( condition) std::cerr << message << std::endl;                                                                             \
 }                                                                                                                                \
 else                                                                                                                             \
 {                                                                                                                                \
-   if( TNL::Communicators::MpiCommunicator::GetRank() > 0 )                                                                      \
+   if( TNL::MPI::GetRank() > 0 )                                                                                                 \
    {                                                                                                                             \
       int __tnl_mpi_print_cnd = ( condition );                                                                                   \
-      TNL::Communicators::MpiCommunicator::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                \
+      TNL::MPI::Send( &__tnl_mpi_print_cnd, 1, 0, 0 );                                                                           \
       if( condition ) {                                                                                                          \
          std::stringstream __tnl_mpi_print_stream_;                                                                              \
-         __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of "                          \
-            << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;                                  \
+         __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : "                     \
+                                 << message << std::endl;                                                                        \
          TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() );                                                   \
          mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() );                                               \
       }                                                                                                                          \
@@ -84,13 +85,11 @@ else
    else                                                                                                                          \
    {                                                                                                                             \
       if( condition )                                                                                                            \
-         std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl;           \
-      for( int __tnl_mpi_print_j = 1;                                                                                            \
-           __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize();                                                   \
-           __tnl_mpi_print_j++ )                                                                                                 \
+         std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl;                                      \
+      for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ )                             \
          {                                                                                                                       \
             int __tnl_mpi_print_cond;                                                                                            \
-            TNL::Communicators::MpiCommunicator::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                         \
+            TNL::MPI::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 );                                                    \
             if( __tnl_mpi_print_cond )                                                                                           \
             {                                                                                                                    \
                TNL::String __tnl_mpi_print_string_;                                                                              \
diff --git a/src/TNL/MPI/Profiling.h b/src/TNL/MPI/Profiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..d50427c16b2f3210ded666cc36d564547a206e03
--- /dev/null
+++ b/src/TNL/MPI/Profiling.h
@@ -0,0 +1,25 @@
+/***************************************************************************
+                          MPI/Profiling.h  -  description
+                             -------------------
+    begin                : Jan 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Timer.h>
+
+namespace TNL {
+namespace MPI {
+
+inline Timer& getTimerAllreduce()
+{
+   static Timer allreduceTimer;  // function-local static: every call returns the same object
+   return allreduceTimer;
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h
similarity index 72%
rename from src/TNL/Communicators/ScopedInitializer.h
rename to src/TNL/MPI/ScopedInitializer.h
index 2970bc628319bdf9d4c40d7a2cb32694a8148f7d..82ba02bc5743611bfb4af7395142de730672d548 100644
--- a/src/TNL/Communicators/ScopedInitializer.h
+++ b/src/TNL/MPI/ScopedInitializer.h
@@ -12,22 +12,25 @@
 
 #pragma once
 
+#include "Wrappers.h"
+#include "Utils.h"
+
 namespace TNL {
-namespace Communicators {
+namespace MPI {
 
-template< typename Communicator >
 struct ScopedInitializer
 {
-   ScopedInitializer( int& argc, char**& argv )
+   ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
    {
-      Communicator::Init( argc, argv );
+      Init( argc, argv, required_thread_level );  // forward the requested thread support level
    }
 
    ~ScopedInitializer()
    {
-      Communicator::Finalize();
+      restoreRedirection();  // must precede Finalize: it queries the MPI size and rank
+      Finalize();
    }
 };
 
-} // namespace Communicators
+} // namespace MPI
 } // namespace TNL
diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d334aaf5bc545ea1953aa7369dcf3669dc16ae0b
--- /dev/null
+++ b/src/TNL/MPI/Utils.h
@@ -0,0 +1,76 @@
+/***************************************************************************
+                          MPI/Utils.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Debugging/OutputRedirection.h>
+
+#include "Wrappers.h"
+
+namespace TNL {
+namespace MPI {
+
+inline bool isInitialized()  // true only when MPI has been initialized and not yet finalized
+{
+   return Initialized() && ! Finalized();
+}
+
+inline void setupRedirection( std::string outputDirectory )
+{
+#ifdef HAVE_MPI
+   if( GetSize() <= 1 || GetRank() == 0 ) return;  // rank 0 and single-process runs are not redirected
+   const std::string rank = std::to_string( GetRank() );
+   const std::string stdoutFile = outputDirectory + "/stdout_" + rank + ".txt";
+   const std::string stderrFile = outputDirectory + "/stderr_" + rank + ".txt";
+   std::cout << rank << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl;
+   Debugging::redirect_stdout_stderr( stdoutFile, stderrFile );
+#endif
+}
+
+// Restores the redirection; usually not needed explicitly (RAII is used internally).
+inline void restoreRedirection()
+{
+   if( GetSize() <= 1 || GetRank() == 0 )
+      return;
+   Debugging::redirect_stdout_stderr( "", "", true );
+}
+
+/**
+ * \brief Returns the rank of the calling process among the processes of the
+ * given communicator that run on the same shared-memory node.
+ *
+ * The communicator \e group is split by `MPI_Comm_split_type` with the
+ * `MPI_COMM_TYPE_SHARED` type (MPI-3) and the rank within the resulting
+ * communicator is returned. Without MPI support, 0 is returned.
+ */
+inline int getRankOnNode( MPI_Comm group = AllGroup() )
+{
+#ifndef HAVE_MPI
+   return 0;
+#else
+   // the rank in the parent communicator serves as the ordering key in the split
+   const int key = GetRank( group );
+
+   MPI_Info hints;
+   MPI_Info_create( &hints );
+
+   MPI_Comm nodeComm;
+   MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, key, hints, &nodeComm );
+   const int rankOnNode = GetRank( nodeComm );
+
+   // release the temporary MPI objects
+   MPI_Comm_free( &nodeComm );
+   MPI_Info_free( &hints );
+   return rankOnNode;
+#endif
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a455dcb75d4bba5d38b993cca932a6cb2c4ea2f
--- /dev/null
+++ b/src/TNL/MPI/Wrappers.h
@@ -0,0 +1,407 @@
+/***************************************************************************
+                          MPI/Wrappers.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#else
+   #include "DummyDefs.h"
+   #include <cstring>  // std::memcpy
+   #include <TNL/Exceptions/MPISupportMissing.h>
+#endif
+
+#include <TNL/Assert.h>
+#include "getDataType.h"
+#include "Profiling.h"
+
+namespace TNL {
+namespace MPI {
+
+// forward declaration to break cyclic inclusion
+inline void selectGPU();
+
+// function wrappers for MPI constants
+
+inline MPI_Comm AllGroup()
+{
+#ifndef HAVE_MPI
+   return 1;  // dummy handle, differs from the value returned by NullGroup
+#else
+   return MPI_COMM_WORLD;
+#endif
+}
+
+inline MPI_Comm NullGroup()
+{
+#ifndef HAVE_MPI
+   return 0;  // dummy handle used when built without MPI
+#else
+   return MPI_COMM_NULL;
+#endif
+}
+
+inline MPI_Request NullRequest()
+{
+#ifndef HAVE_MPI
+   return 0;  // dummy request used when built without MPI
+#else
+   return MPI_REQUEST_NULL;
+#endif
+}
+
+// wrappers for basic MPI functions
+
+/**
+ * \brief Initializes MPI by \ref MPI_Init_thread and calls \ref selectGPU.
+ *
+ * \param required_thread_level - one of the MPI_THREAD_* constants
+ *
+ * Throws \ref std::invalid_argument for an invalid thread level and aborts if
+ * the library provides a lower level. Does nothing when built without MPI.
+ */
+inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE )
+{
+#ifdef HAVE_MPI
+   switch( required_thread_level ) {
+      case MPI_THREAD_SINGLE:
+      case MPI_THREAD_FUNNELED:
+      case MPI_THREAD_SERIALIZED:
+      case MPI_THREAD_MULTIPLE:
+         break;
+      default:
+         // MPI is not initialized yet, so MPI_Abort must not be called here
+         throw std::invalid_argument( "invalid argument for the 'required' thread level support: " + std::to_string( required_thread_level ) );
+   }
+
+   int provided;
+   MPI_Init_thread( &argc, &argv, required_thread_level, &provided );
+   if( provided < required_thread_level ) {
+      const char* level = "";
+      switch( required_thread_level ) {
+         case MPI_THREAD_SINGLE:     level = "MPI_THREAD_SINGLE";     break;
+         case MPI_THREAD_FUNNELED:   level = "MPI_THREAD_FUNNELED";   break;
+         case MPI_THREAD_SERIALIZED: level = "MPI_THREAD_SERIALIZED"; break;
+         case MPI_THREAD_MULTIPLE:   level = "MPI_THREAD_MULTIPLE";   break;
+      }
+      std::cerr << "ERROR: The MPI library does not have the required level of thread support: " << level << std::endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+   }
+
+   selectGPU();
+#endif
+}
+
+inline void Finalize()  // calls MPI_Finalize; does nothing when built without MPI
+{
+#ifdef HAVE_MPI
+   MPI_Finalize();
+#endif
+}
+
+inline bool Initialized()
+{
+#ifndef HAVE_MPI
+   return true;  // builds without MPI always report the initialized state
+#else
+   int initialized = 0;
+   MPI_Initialized( &initialized );
+   return initialized != 0;
+#endif
+}
+
+inline bool Finalized()
+{
+#ifndef HAVE_MPI
+   return false;  // builds without MPI never report the finalized state
+#else
+   int finalized = 0;
+   MPI_Finalized( &finalized );
+   return finalized != 0;
+#endif
+}
+
+inline int GetRank( MPI_Comm group = AllGroup() )
+{
+#ifndef HAVE_MPI
+   return 0;  // fixed rank reported when built without MPI
+#else
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetRank cannot be called with NullGroup" );
+   int myRank;
+   MPI_Comm_rank( group, &myRank );
+   return myRank;
+#endif
+}
+
+inline int GetSize( MPI_Comm group = AllGroup() )
+{
+#ifndef HAVE_MPI
+   return 1;  // fixed size reported when built without MPI
+#else
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "GetSize cannot be called with NullGroup" );
+   int groupSize;
+   MPI_Comm_size( group, &groupSize );
+   return groupSize;
+#endif
+}
+
+// wrappers for MPI helper functions
+
+inline MPI_Comm Comm_split( MPI_Comm comm, int color, int key )
+{
+#ifndef HAVE_MPI
+   return comm;  // without MPI the input handle is returned unchanged
+#else
+   MPI_Comm splitComm;
+   MPI_Comm_split( comm, color, key, &splitComm );
+   return splitComm;
+#endif
+}
+
+/**
+ * \brief Wrapper for \ref MPI_Dims_create.
+ *
+ * \param nproc - number of processes in the group to be distributed
+ * \param ndims - number of dimensions of the Cartesian grid
+ * \param dims - distribution of processes into the \e ndims-dimensional
+ *               Cartesian grid (array of length \e ndims)
+ *
+ * With MPI: throws \ref std::invalid_argument if some \e dims[i] is negative
+ * and \ref std::logic_error if \e nproc is not a multiple of the product of
+ * all positive values \e dims[i]. Without MPI: all \e dims[i] are set to 1.
+ * See the MPI documentation for more information.
+ */
+inline void Compute_dims( int nproc, int ndims, int* dims )
+{
+#ifdef HAVE_MPI
+   int prod = 1;
+   for( int i = 0; i < ndims; i++ ) {
+      if( dims[ i ] < 0 )
+         throw std::invalid_argument( "Negative value passed to MPI::Compute_dims in the dims array argument." );
+      if( dims[ i ] > 0 )
+         prod *= dims[ i ];
+   }
+
+   if( nproc % prod != 0 )
+      throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions. "
+            "The product of the non-zero values dims[i] is " + std::to_string(prod) + " and the "
+            "number of processes (" + std::to_string(nproc) + ") is not a multiple of the product." );
+
+   MPI_Dims_create( nproc, ndims, dims );
+#else
+   for( int i = 0; i < ndims; i++)
+      dims[ i ] = 1;
+#endif
+}
+
+// wrappers for MPI communication functions
+
+inline void Barrier( MPI_Comm group = AllGroup() )  // does nothing when built without MPI
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Barrier cannot be called with NullGroup" );
+   MPI_Barrier(group);
+#endif
+}
+
+inline void Waitall( MPI_Request* reqs, int length )  // note: array first, count second (reverse of MPI_Waitall)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   MPI_Waitall( length, reqs, MPI_STATUSES_IGNORE );  // the completion statuses are discarded
+#endif
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+void Send( const T* data,
+           int count,
+           int dest,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Send cannot be called with NullGroup" );
+   MPI_Send( (const void*) data, count, getDataType<T>(), dest, tag, group );
+#endif  // without MPI the call does nothing
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+void Recv( T* data,
+           int count,
+           int src,
+           int tag,
+           MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Recv cannot be called with NullGroup" );
+   MPI_Recv( (void*) data, count, getDataType<T>(), src, tag, group, MPI_STATUS_IGNORE );  // the status is discarded
+#endif  // without MPI the call does nothing and the buffer is left untouched
+}
+
+template< typename T >  // the same element type T is used for both the sent and the received data
+void Sendrecv( const T* sendData,
+               int sendCount,
+               int destination,
+               int sendTag,
+               T* receiveData,
+               int receiveCount,
+               int source,
+               int receiveTag,
+               MPI_Comm group = AllGroup() )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Sendrecv cannot be called with NullGroup" );
+   MPI_Sendrecv( (void*) sendData,  // the C-style cast also drops the const qualifier
+                 sendCount,
+                 getDataType<T>(),
+                 destination,
+                 sendTag,
+                 (void*) receiveData,
+                 receiveCount,
+                 getDataType<T>(),
+                 source,
+                 receiveTag,
+                 group,
+                 MPI_STATUS_IGNORE );  // the receive status is discarded
+#else
+   throw Exceptions::MPISupportMissing();  // unlike Send and Recv, this wrapper is not a no-op without MPI
+#endif
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+MPI_Request Isend( const T* data,
+                   int count,
+                   int dest,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifndef HAVE_MPI
+   return NullRequest();  // nothing is sent without MPI
+#else
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Isend cannot be called with NullGroup" );
+   MPI_Request request;
+   MPI_Isend( (const void*) data, count, getDataType<T>(), dest, tag, group, &request );
+   return request;
+#endif
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+MPI_Request Irecv( T* data,
+                   int count,
+                   int src,
+                   int tag,
+                   MPI_Comm group = AllGroup() )
+{
+#ifndef HAVE_MPI
+   return NullRequest();  // nothing is received without MPI
+#else
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Irecv cannot be called with NullGroup" );
+   MPI_Request request;
+   MPI_Irecv( (void*) data, count, getDataType<T>(), src, tag, group, &request );
+   return request;
+#endif
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+void Allreduce( const T* data,
+                T* reduced_data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifndef HAVE_MPI
+   std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );  // plain copy of the input
+#else
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   getTimerAllreduce().start();  // the timer brackets the MPI call only
+   MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, group );
+   getTimerAllreduce().stop();
+#endif
+}
+
+// in-place variant of Allreduce: data is passed as the receive buffer together with MPI_IN_PLACE
+template< typename T >
+void Allreduce( T* data,
+                int count,
+                const MPI_Op& op,
+                MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" );
+   getTimerAllreduce().start();  // the timer brackets the MPI call only
+   MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType<T>(), op, group );
+   getTimerAllreduce().stop();
+#endif  // without MPI the data is left unchanged
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+void Reduce( const T* data,
+             T* reduced_data,
+             int count,
+             const MPI_Op& op,
+             int root,
+             MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_NE( group, NullGroup(), "Reduce cannot be called with NullGroup" );
+   MPI_Reduce( (const void*) data, (void*) reduced_data, count, getDataType<T>(), op, root, group );
+#else
+   std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) );  // plain copy, the source stays const
+#endif
+}
+
+template< typename T >  // T selects the MPI datatype via getDataType<T>()
+void Bcast( T* data, int count, int root, MPI_Comm group)
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" );
+   TNL_ASSERT_NE( group, NullGroup(), "Bcast cannot be called with NullGroup" );
+   MPI_Bcast( (void*) data, count, getDataType<T>(), root, group );
+#endif  // without MPI the data is left unchanged
+}
+
+/**
+ * \brief Wrapper for \ref MPI_Alltoall using the same element type \e T for
+ * the sent and received data. Without MPI, \e sendCount elements are copied.
+ */
+template< typename T >
+void Alltoall( const T* sendData,
+               int sendCount,
+               T* receiveData,
+               int receiveCount,
+               MPI_Comm group )
+{
+#ifndef HAVE_MPI
+   // no communication: the send buffer is copied to the receive buffer
+   TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." );
+   std::memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof(T) );
+#else
+   TNL_ASSERT_NE( group, NullGroup(), "Alltoall cannot be called with NullGroup" );
+   const MPI_Datatype type = getDataType<T>();
+   MPI_Alltoall( (const void*) sendData, sendCount, type, (void*) receiveData, receiveCount, type, group );
+#endif
+}
+
+} // namespace MPI
+} // namespace TNL
+
+// late inclusion to break cyclic inclusion
+#include "selectGPU.h"
diff --git a/src/TNL/MPI/getDataType.h b/src/TNL/MPI/getDataType.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3570679bf2708cca08de3e485890588396a051e
--- /dev/null
+++ b/src/TNL/MPI/getDataType.h
@@ -0,0 +1,119 @@
+/***************************************************************************
+                          getDataType.h  -  description
+                             -------------------
+    begin                : Feb 4, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_MPI
+   #include <mpi.h>
+#endif
+
+namespace TNL {
+namespace MPI {
+
+#ifdef HAVE_MPI
+/**
+ * Fallback resolver for types without an explicit specialization: the MPI
+ * datatype is selected only by the size of \e T.
+ */
+template< typename T >
+struct TypeResolver
+{
+   static inline MPI_Datatype getType()
+   {
+      static_assert( sizeof(T) == sizeof(char) ||
+                     sizeof(T) == sizeof(int) ||
+                     sizeof(T) == sizeof(short int) ||
+                     sizeof(T) == sizeof(long int),
+                     "Fatal Error - Unknown MPI Type");
+      // An if-chain is used instead of a switch on sizeof(T), because the
+      // case labels would be duplicated (ill-formed) on platforms where
+      // e.g. sizeof(int) == sizeof(long int).
+      if( sizeof(T) == sizeof(char) )
+         return MPI_CHAR;
+      if( sizeof(T) == sizeof(int) )
+         return MPI_INT;
+      if( sizeof(T) == sizeof(short int) )
+         return MPI_SHORT;
+      // the static_assert above guarantees that only this size remains
+      return MPI_LONG;
+   }
+};
+
+// Explicit specializations mapping the fundamental types to MPI datatypes.
+// The `long long` types are mapped explicitly so that they do not fall back
+// to the size-based generic resolver, which ignores signedness.
+
+template<> struct TypeResolver< char > {
+   static inline MPI_Datatype getType() { return MPI_CHAR; }
+};
+
+template<> struct TypeResolver< int > {
+   static inline MPI_Datatype getType() { return MPI_INT; }
+};
+
+template<> struct TypeResolver< short int > {
+   static inline MPI_Datatype getType() { return MPI_SHORT; }
+};
+
+template<> struct TypeResolver< long int > {
+   static inline MPI_Datatype getType() { return MPI_LONG; }
+};
+
+template<> struct TypeResolver< long long int > {
+   static inline MPI_Datatype getType() { return MPI_LONG_LONG_INT; }
+};
+
+template<> struct TypeResolver< unsigned char > {
+   static inline MPI_Datatype getType() { return MPI_UNSIGNED_CHAR; }
+};
+
+template<> struct TypeResolver< unsigned short int > {
+   static inline MPI_Datatype getType() { return MPI_UNSIGNED_SHORT; }
+};
+
+template<> struct TypeResolver< unsigned int > {
+   static inline MPI_Datatype getType() { return MPI_UNSIGNED; }
+};
+
+template<> struct TypeResolver< unsigned long int > {
+   static inline MPI_Datatype getType() { return MPI_UNSIGNED_LONG; }
+};
+
+template<> struct TypeResolver< unsigned long long int > {
+   static inline MPI_Datatype getType() { return MPI_UNSIGNED_LONG_LONG; }
+};
+
+template<> struct TypeResolver< float > {
+   static inline MPI_Datatype getType() { return MPI_FLOAT; }
+};
+
+template<> struct TypeResolver< double > {
+   static inline MPI_Datatype getType() { return MPI_DOUBLE; }
+};
+
+template<> struct TypeResolver< long double > {
+   static inline MPI_Datatype getType() { return MPI_LONG_DOUBLE; }
+};
+
+template<> struct TypeResolver< bool > {
+   // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859
+   static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." );
+   static inline MPI_Datatype getType() { return MPI_C_BOOL; }
+};
+
+template< typename T >
+MPI_Datatype getDataType( const T& = T{} )  // the unnamed argument only allows deduction of T, its value is unused
+{
+   return TypeResolver< T >::getType();
+}
+#endif
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..781a52809a0151f30b3c031acbb7aaadf51b766d
--- /dev/null
+++ b/src/TNL/MPI/selectGPU.h
@@ -0,0 +1,37 @@
+/***************************************************************************
+                          MPI/selectGPU.h  -  description
+                             -------------------
+    begin                : Apr 23, 2005
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Cuda/CheckDevice.h>
+
+#include "Utils.h"
+
+namespace TNL {
+namespace MPI {
+
+// Binds this process to GPU number (rank-on-node % device count); no-op without MPI+CUDA.
+inline void selectGPU()
+{
+#if defined( HAVE_MPI ) && defined( HAVE_CUDA )
+   int gpuCount = 0;
+   cudaGetDeviceCount(&gpuCount);
+   TNL_CHECK_CUDA_DEVICE;  // check right after the query (presumably reports the last CUDA error - confirm in CheckDevice.h)
+   if( gpuCount <= 0 )     // nothing to select; also avoids the modulo by zero below
+      return;
+
+   const int gpuNumber = getRankOnNode() % gpuCount;
+   cudaSetDevice(gpuNumber);
+   TNL_CHECK_CUDA_DEVICE;
+#endif
+}
+
+} // namespace MPI
+} // namespace TNL
diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index faa220da69975ed3a3a963fcf48186fb6a98740b..61e4eabb6dd1a3629e0f95ac67332386a4f94760 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -14,7 +14,6 @@
 
 #include <type_traits>
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Containers/DistributedVectorView.h>
@@ -23,65 +22,42 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename T, typename R = void >
-struct enable_if_type
-{
-   using type = R;
-};
-
-template< typename T, typename Enable = void >
-struct has_communicator : std::false_type {};
-
-template< typename T >
-struct has_communicator< T, typename enable_if_type< typename T::CommunicatorType >::type >
-: std::true_type
-{};
-
-
 // TODO: 2D distribution for dense matrices (maybe it should be in different template,
 //       because e.g. setRowFast doesn't make sense for dense matrices)
-template< typename Matrix,
-          typename Communicator = Communicators::MpiCommunicator >
+template< typename Matrix >
 class DistributedMatrix
 {
-   using CommunicationGroup = typename Communicator::CommunicationGroup;
 public:
    using MatrixType = Matrix;
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
-   using CommunicatorType = Communicator;
    using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >;
 
-   using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >;
+   using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType >;
 
    using MatrixRow = typename Matrix::RowView;
    using ConstMatrixRow = typename Matrix::ConstRowView;
 
    template< typename _Real = RealType,
              typename _Device = DeviceType,
-             typename _Index = IndexType,
-             typename _Communicator = Communicator >
-   using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index >, _Communicator >;
+             typename _Index = IndexType >
+   using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index > >;
 
    DistributedMatrix() = default;
 
    DistributedMatrix( DistributedMatrix& ) = default;
 
-   DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup );
+   DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() );
 
-   void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup );
+   void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() );
 
-   __cuda_callable__
    const LocalRangeType& getLocalRowRange() const;
 
-   __cuda_callable__
-   CommunicationGroup getCommunicationGroup() const;
+   MPI_Comm getCommunicationGroup() const;
 
-   __cuda_callable__
    const Matrix& getLocalMatrix() const;
 
-   __cuda_callable__
    Matrix& getLocalMatrix();
 
 
@@ -99,10 +75,8 @@ public:
 
    void reset();
 
-   __cuda_callable__
    IndexType getRows() const;
 
-   __cuda_callable__
    IndexType getColumns() const;
 
    template< typename RowCapacitiesVector >
@@ -120,20 +94,17 @@ public:
    RealType getElement( IndexType row,
                         IndexType column ) const;
 
-   __cuda_callable__
    RealType getElementFast( IndexType row,
                             IndexType column ) const;
 
-   __cuda_callable__
    MatrixRow getRow( IndexType row );
 
-   __cuda_callable__
    ConstMatrixRow getRow( IndexType row ) const;
 
    // multiplication with a global vector
    template< typename InVector,
              typename OutVector >
-   typename std::enable_if< ! has_communicator< InVector >::value >::type
+   typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type
    vectorProduct( const InVector& inVector,
                   OutVector& outVector ) const;
 
@@ -144,7 +115,7 @@ public:
    // (not const because it modifies internal bufers)
    template< typename InVector,
              typename OutVector >
-   typename std::enable_if< has_communicator< InVector >::value >::type
+   typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type
    vectorProduct( const InVector& inVector,
                   OutVector& outVector ) const;
 
@@ -158,10 +129,10 @@ public:
 protected:
    LocalRangeType localRowRange;
    IndexType rows = 0;  // global rows count
-   CommunicationGroup group = Communicator::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    Matrix localMatrix;
 
-   DistributedSpMV< Matrix, Communicator > spmv;
+   DistributedSpMV< Matrix > spmv;
 };
 
 } // namespace Matrices
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index 806703ca6a28ea647d1760010b3a4febe9a0e439..8bc5d09820d0d7961bf91710f97b7eb4247dce1f 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -17,64 +17,54 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename Matrix,
-          typename Communicator >
-DistributedMatrix< Matrix, Communicator >::
-DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group )
+template< typename Matrix >
+DistributedMatrix< Matrix >::
+DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group )
 {
    setDistribution( localRowRange, rows, columns, group );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
-setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group )
+DistributedMatrix< Matrix >::
+setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group )
 {
    this->localRowRange = localRowRange;
    this->rows = rows;
    this->group = group;
-   if( group != Communicator::NullGroup )
+   if( group != MPI::NullGroup() )
       localMatrix.setDimensions( localRowRange.getSize(), columns );
 
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 const Containers::Subrange< typename Matrix::IndexType >&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalRowRange() const
 {
    return localRowRange;
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-typename Communicator::CommunicationGroup
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+MPI_Comm
+DistributedMatrix< Matrix >::
 getCommunicationGroup() const
 {
    return group;
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 const Matrix&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalMatrix() const
 {
    return localMatrix;
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 Matrix&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getLocalMatrix()
 {
    return localMatrix;
@@ -85,10 +75,9 @@ getLocalMatrix()
  * Some common Matrix methods follow below.
  */
 
-template< typename Matrix,
-          typename Communicator >
-DistributedMatrix< Matrix, Communicator >&
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+DistributedMatrix< Matrix >&
+DistributedMatrix< Matrix >::
 operator=( const DistributedMatrix& matrix )
 {
    setLike( matrix );
@@ -96,11 +85,10 @@ operator=( const DistributedMatrix& matrix )
    return *this;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename MatrixT >
-DistributedMatrix< Matrix, Communicator >&
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >&
+DistributedMatrix< Matrix >::
 operator=( const MatrixT& matrix )
 {
    setLike( matrix );
@@ -108,11 +96,10 @@ operator=( const MatrixT& matrix )
    return *this;
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename MatrixT >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setLike( const MatrixT& matrix )
 {
    localRowRange = matrix.getLocalRowRange();
@@ -123,86 +110,77 @@ setLike( const MatrixT& matrix )
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 reset()
 {
    localRowRange.reset();
    rows = 0;
-   group = Communicator::NullGroup;
+   group = MPI::NullGroup();
    localMatrix.reset();
 
    spmv.reset();
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getRows() const
 {
    return rows;
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getColumns() const
 {
    return localMatrix.getColumns();
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename RowCapacitiesVector >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setRowCapacities( const RowCapacitiesVector& rowCapacities )
 {
    TNL_ASSERT_EQ( rowCapacities.getSize(), getRows(), "row lengths vector has wrong size" );
    TNL_ASSERT_EQ( rowCapacities.getLocalRange(), getLocalRowRange(), "row lengths vector has wrong distribution" );
    TNL_ASSERT_EQ( rowCapacities.getCommunicationGroup(), getCommunicationGroup(), "row lengths vector has wrong communication group" );
 
-   if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
+   if( getCommunicationGroup() != MPI::NullGroup() ) {
       localMatrix.setRowCapacities( rowCapacities.getConstLocalView() );
 
       spmv.reset();
    }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename Vector >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
-      rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
+   if( getCommunicationGroup() != MPI::NullGroup() ) {
+      rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() );
       auto localRowLengths = rowLengths.getLocalView();
       localMatrix.getCompressedRowLengths( localRowLengths );
    }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::IndexType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getRowCapacity( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRowCapacity( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 setElement( IndexType row,
             IndexType column,
             RealType value )
@@ -211,10 +189,9 @@ setElement( IndexType row,
    localMatrix.setElement( localRow, column, value );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 typename Matrix::RealType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getElement( IndexType row,
             IndexType column ) const
 {
@@ -222,11 +199,9 @@ getElement( IndexType row,
    return localMatrix.getElement( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
+template< typename Matrix >
 typename Matrix::RealType
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 getElementFast( IndexType row,
                 IndexType column ) const
 {
@@ -234,34 +209,29 @@ getElementFast( IndexType row,
    return localMatrix.getElementFast( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-typename DistributedMatrix< Matrix, Communicator >::MatrixRow
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+typename DistributedMatrix< Matrix >::MatrixRow
+DistributedMatrix< Matrix >::
 getRow( IndexType row )
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRow( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow
-DistributedMatrix< Matrix, Communicator >::
+template< typename Matrix >
+typename DistributedMatrix< Matrix >::ConstMatrixRow
+DistributedMatrix< Matrix >::
 getRow( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRow( localRow );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename InVector,
              typename OutVector >
-typename std::enable_if< ! has_communicator< InVector >::value >::type
-DistributedMatrix< Matrix, Communicator >::
+typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type
+DistributedMatrix< Matrix >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector ) const
 {
@@ -274,44 +244,57 @@ vectorProduct( const InVector& inVector,
    localMatrix.vectorProduct( inVector, outView );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
 void
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 updateVectorProductCommunicationPattern()
 {
-   if( getCommunicationGroup() == CommunicatorType::NullGroup )
+   if( getCommunicationGroup() == MPI::NullGroup() )
       return;
    spmv.updateCommunicationPattern( getLocalMatrix(), getCommunicationGroup() );
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename InVector,
              typename OutVector >
-typename std::enable_if< has_communicator< InVector >::value >::type
-DistributedMatrix< Matrix, Communicator >::
+typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type
+DistributedMatrix< Matrix >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector ) const
 {
-   TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" );
    TNL_ASSERT_EQ( inVector.getLocalRange(), getLocalRowRange(), "input vector has wrong distribution" );
    TNL_ASSERT_EQ( inVector.getCommunicationGroup(), getCommunicationGroup(), "input vector has wrong communication group" );
    TNL_ASSERT_EQ( outVector.getSize(), getRows(), "output vector has wrong size" );
    TNL_ASSERT_EQ( outVector.getLocalRange(), getLocalRowRange(), "output vector has wrong distribution" );
    TNL_ASSERT_EQ( outVector.getCommunicationGroup(), getCommunicationGroup(), "output vector has wrong communication group" );
 
-   if( getCommunicationGroup() == CommunicatorType::NullGroup )
+   if( getCommunicationGroup() == MPI::NullGroup() )
       return;
 
-   const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() );
+   if( inVector.getGhosts() == 0 ) {
+      // NOTE: this branch is deprecated and kept only due to existing benchmarks
+      TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" );
+      const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() );
+   }
+   else {
+      TNL_ASSERT_EQ( inVector.getConstLocalViewWithGhosts().getSize(), localMatrix.getColumns(), "the matrix uses non-local and non-ghost column indices" );
+      TNL_ASSERT_EQ( inVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "input vector has wrong ghosts size" );
+      TNL_ASSERT_EQ( outVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "output vector has wrong ghosts size" );
+      TNL_ASSERT_EQ( outVector.getConstLocalView().getSize(), localMatrix.getRows(), "number of local matrix rows does not match the output vector local size" );
+
+      inVector.waitForSynchronization();
+      const auto inView = inVector.getConstLocalViewWithGhosts();
+      auto outView = outVector.getLocalView();
+      localMatrix.vectorProduct( inView, outView );
+      // TODO: synchronization is not always necessary, e.g. when a preconditioning step follows
+//      outVector.startSynchronization();
+   }
 }
 
-template< typename Matrix,
-          typename Communicator >
+template< typename Matrix >
    template< typename Vector1, typename Vector2 >
 bool
-DistributedMatrix< Matrix, Communicator >::
+DistributedMatrix< Matrix >::
 performSORIteration( const Vector1& b,
                      const IndexType row,
                      Vector2& x,
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 76aaa77fef49997429db3ea47076a01db0f48997..bea864eadcbb368ef2748fbc3bfef47ad47a9abc 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -33,7 +33,7 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >
 class DistributedSpMV
 {
 public:
@@ -41,8 +41,6 @@ public:
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
-   using CommunicatorType = Communicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >;
 
    // - communication pattern: vector components whose indices are in the range
@@ -55,10 +53,10 @@ public:
    // - assembly of the i-th row involves traversal of the local matrix stored
    //   in the i-th process
    // - assembly of the full matrix needs all-to-all communication
-   void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, CommunicationGroup group )
+   void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, MPI_Comm group )
    {
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
       commPatternStarts.setDimensions( nproc, nproc );
       commPatternEnds.setDimensions( nproc, nproc );
 
@@ -67,9 +65,9 @@ public:
       {
          Containers::Array< IndexType, Devices::Host, int > sendbuf( nproc );
          sendbuf.setValue( localRowRange.getBegin() );
-         CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                     globalOffsets.getData(), 1,
-                                     group );
+         MPI::Alltoall( sendbuf.getData(), 1,
+                        globalOffsets.getData(), 1,
+                        group );
       }
       const auto globalOffsetsView = globalOffsets.getConstView();
       auto getOwner = [=] __cuda_callable__ ( IndexType global_idx ) -> int
@@ -150,12 +148,12 @@ public:
       }
 
       // assemble the commPattern* matrices
-      CommunicatorType::Alltoall( &preCommPatternStarts(0, 0), nproc,
-                                  &commPatternStarts(0, 0), nproc,
-                                  group );
-      CommunicatorType::Alltoall( &preCommPatternEnds(0, 0), nproc,
-                                  &commPatternEnds(0, 0), nproc,
-                                  group );
+      MPI::Alltoall( &preCommPatternStarts(0, 0), nproc,
+                     &commPatternStarts(0, 0), nproc,
+                     group );
+      MPI::Alltoall( &preCommPatternEnds(0, 0), nproc,
+                     &commPatternEnds(0, 0), nproc,
+                     group );
    }
 
    template< typename InVector,
@@ -164,10 +162,10 @@ public:
                        const MatrixType& localMatrix,
                        const LocalRangeType& localRowRange,
                        const InVector& inVector,
-                       CommunicationGroup group )
+                       MPI_Comm group )
    {
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // handle trivial case
       if( nproc == 1 ) {
@@ -190,14 +188,14 @@ public:
       TNL_ASSERT_EQ( globalBuffer.getSize(), localMatrix.getColumns(), "the global buffer size does not match the number of matrix columns" );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > commRequests;
+      std::vector< MPI_Request > commRequests;
 
       // send our data to all processes that need it
       for( int i = 0; i < commPatternStarts.getRows(); i++ ) {
          if( i == rank )
              continue;
          if( commPatternStarts( i, rank ) < commPatternEnds( i, rank ) )
-            commRequests.push_back( CommunicatorType::ISend(
+            commRequests.push_back( MPI::Isend(
                      inVector.getConstLocalView().getData() + commPatternStarts( i, rank ) - localRowRange.getBegin(),
                      commPatternEnds( i, rank ) - commPatternStarts( i, rank ),
                      i, 0, group ) );
@@ -208,7 +206,7 @@ public:
          if( j == rank )
              continue;
          if( commPatternStarts( rank, j ) < commPatternEnds( rank, j ) )
-            commRequests.push_back( CommunicatorType::IRecv(
+            commRequests.push_back( MPI::Irecv(
                      globalBuffer.getPointer( commPatternStarts( rank, j ) ),
                      commPatternEnds( rank, j ) - commPatternStarts( rank, j ),
                      j, 0, group ) );
@@ -217,7 +215,7 @@ public:
       // general variant
       if( localOnlySpan.first >= localOnlySpan.second ) {
          // wait for all communications to finish
-         CommunicatorType::WaitAll( commRequests.data(), commRequests.size() );
+         MPI::Waitall( commRequests.data(), commRequests.size() );
 
          // perform matrix-vector multiplication
          auto outVectorView = outVector.getLocalView();
@@ -231,7 +229,7 @@ public:
          localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
 
          // wait for all communications to finish
-         CommunicatorType::WaitAll( commRequests.data(), commRequests.size() );
+         MPI::Waitall( commRequests.data(), commRequests.size() );
 
          // finish the multiplication by adding the non-local entries
          localMatrix.vectorProduct( globalBuffer, outVectorView, 1.0, 0.0, 0, localOnlySpan.first );
diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
index 6030b976f038ab290ada814575db1bfb444ce694..04647cb4af883d07bf00e6ef177a8205c5be559c 100644
--- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
+++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
@@ -12,7 +12,6 @@
 
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Communicators/MPIPrint.h>
 
 namespace TNL {
 namespace Meshes {
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
index 6e346668ccaddcc0b124afc733d1a602d3dfadfa..4082024e378d844f9155bbabbedaccfd6f468599 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h
@@ -11,8 +11,6 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Logger.h>
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
@@ -20,7 +18,7 @@
 
 
 namespace TNL {
-namespace Meshes { 
+namespace Meshes {
 namespace DistributedMeshes {
 
 
@@ -28,7 +26,7 @@ namespace DistributedMeshes {
 template< int Dimension,
           typename Real,
           typename Device,
-          typename Index >     
+          typename Index >
 class DistributedMesh< Grid< Dimension, Real, Device, Index > >
 {
   public:
@@ -41,44 +39,43 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       typedef Containers::StaticVector< Dimension, IndexType > CoordinatesType;
       typedef Containers::StaticVector< Dimension, IndexType > SubdomainOverlapsType;
 
-      static constexpr int getMeshDimension() { return Dimension; };  
+      static constexpr int getMeshDimension() { return Dimension; };
 
-      static constexpr int getNeighborsCount() { return DirectionCount<Dimension>::get(); } //c++14 may use Directions::pow3(Dimension)-1 
+      static constexpr int getNeighborsCount() { return DirectionCount<Dimension>::get(); } //c++14 may use Directions::pow3(Dimension)-1
 
       DistributedMesh();
 
       ~DistributedMesh();
-      
+
       static void configSetup( Config::ConfigDescription& config );
-      
+
       bool setup( const Config::ParameterContainer& parameters,
-                  const String& prefix );      
-    
+                  const String& prefix );
+
       void setDomainDecomposition( const CoordinatesType& domainDecomposition );
-      
+
       const CoordinatesType& getDomainDecomposition() const;
-      
-      template< typename CommunicatorType >
+
       void setGlobalGrid( const GridType& globalGrid );
-      
+
       const GridType& getGlobalGrid() const;
-      
+
       void setOverlaps( const SubdomainOverlapsType& lower,
                         const SubdomainOverlapsType& upper);
-      
+
       void setupGrid( GridType& grid);
 
       bool isDistributed() const;
-      
+
       bool isBoundarySubdomain() const;
-           
+
       // TODO: replace it with getLowerOverlap() and getUpperOverlap()
       // It is still being used in cuts set-up
       const CoordinatesType& getOverlap() const { return this->overlap;};
-      
+
       //currently used overlaps at this subdomain
       const SubdomainOverlapsType& getLowerOverlap() const;
-      
+
       const SubdomainOverlapsType& getUpperOverlap() const;
 
       //number of elements of local sub domain WITHOUT overlap
@@ -95,7 +92,7 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       //number of elements of local sub domain WITH overlap
       // TODO: replace with localGrid
       const CoordinatesType& getLocalGridSize() const;
-       
+
       //coordinates of begin of local subdomain without overlaps in local grid
       const CoordinatesType& getLocalBegin() const;
 
@@ -104,40 +101,40 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       const PointType& getLocalOrigin() const;
       const PointType& getSpaceSteps() const;
 
-      //aka MPI-communcicator  
-      void setCommunicationGroup(void * group);
-      void * getCommunicationGroup() const;
+      //aka MPI-communicator
+      void setCommunicationGroup(MPI_Comm group);
+      MPI_Comm getCommunicationGroup() const;
 
       template< int EntityDimension >
       IndexType getEntitiesCount() const;
 
       template< typename Entity >
-      IndexType getEntitiesCount() const; 
+      IndexType getEntitiesCount() const;
 
       const int* getNeighbors() const;
-      
-      const int* getPeriodicNeighbors() const;      
 
-      template<typename CommunicatorType, typename DistributedGridType>
-      bool SetupByCut(DistributedGridType &inputDistributedGrid, 
-                 Containers::StaticVector<Dimension, int> savedDimensions, 
-                 Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions, 
+      const int* getPeriodicNeighbors() const;
+
+      template<typename DistributedGridType>
+      bool SetupByCut(DistributedGridType &inputDistributedGrid,
+                 Containers::StaticVector<Dimension, int> savedDimensions,
+                 Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions,
                  Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,IndexType> fixedIndexs);
 
       int getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const;
-      
+
       String printProcessCoords() const;
 
       String printProcessDistr() const;
-      
+
       void writeProlog( Logger& logger );
 
-   public: 
-      
+   public:
+
       bool isThereNeighbor(const CoordinatesType &direction) const;
 
       void setupNeighbors();
-      
+
       void print( std::ostream& str ) const;
 
       GridType globalGrid;
@@ -149,26 +146,26 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > >
       //CoordinatesType globalDimensions;
       CoordinatesType globalBegin;
       PointType spaceSteps;
-      
+
       SubdomainOverlapsType lowerOverlap, upperOverlap, globalLowerOverlap, globalUpperOverlap;
 
       CoordinatesType domainDecomposition;
-      CoordinatesType subdomainCoordinates;   
+      CoordinatesType subdomainCoordinates;
 
       // TODO: static arrays
       int neighbors[ getNeighborsCount() ];
       int periodicNeighbors[ getNeighborsCount() ];
 
-      IndexType Dimensions;        
+      IndexType Dimensions;
       bool distributed;
-        
+
       int rank;
       int nproc;
 
       bool isSet;
 
-      //aka MPI-communicator 
-      void * communicationGroup;
+      //aka MPI-communicator
+      MPI_Comm group;
 
 };
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
index a35b539629544c88e4946fcadff68cc19b6b4bc4..c48fec9af40e64d4f8dc12ff51c323e47360f09f 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
@@ -11,9 +11,9 @@
 #pragma once
 
 #include <cstdlib>
-#include <TNL/Communicators/MpiCommunicator.h>
 
 #include "DistributedGrid.h"
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
    namespace Meshes {
@@ -28,8 +28,6 @@ template<int Dimension, typename Real, typename Device, typename Index >
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 ~DistributedMesh()
 {
-    if(isSet && this->communicationGroup!=nullptr)
-        std::free(this->communicationGroup);
 }
 
 
@@ -57,7 +55,7 @@ setup( const Config::ParameterContainer& parameters,
    return true;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setDomainDecomposition( const CoordinatesType& domainDecomposition )
@@ -65,7 +63,7 @@ setDomainDecomposition( const CoordinatesType& domainDecomposition )
    this->domainDecomposition = domainDecomposition;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getDomainDecomposition() const
@@ -73,18 +71,12 @@ getDomainDecomposition() const
    return this->domainDecomposition;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
-template< typename CommunicatorType >
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setGlobalGrid( const GridType &globalGrid )
 {
-   if(this->isSet && this->communicationGroup != nullptr)
-        std::free(this->communicationGroup);
-   this->communicationGroup= std::malloc(sizeof(typename CommunicatorType::CommunicationGroup));
-
-   *((typename CommunicatorType::CommunicationGroup *)this->communicationGroup) = CommunicatorType::AllGroup;
-   auto group=*((typename CommunicatorType::CommunicationGroup *)this->communicationGroup);
+   this->group = MPI::AllGroup();
 
    this->globalGrid = globalGrid;
    this->isSet=true;
@@ -99,15 +91,12 @@ setGlobalGrid( const GridType &globalGrid )
    this->spaceSteps=globalGrid.getSpaceSteps();
    this->distributed=false;
 
-   if( CommunicatorType::IsInitialized() )
+   this->rank=MPI::GetRank(group);
+   this->nproc=MPI::GetSize(group);
+   //use MPI only if there is more than one process
+   if(this->nproc>1)
    {
-      this->rank=CommunicatorType::GetRank(group);
-      this->nproc=CommunicatorType::GetSize(group);
-      //use MPI only if have more than one process
-      if(this->nproc>1)
-      {
-         this->distributed=true;
-      }
+      this->distributed=true;
    }
 
    if( !this->distributed )
@@ -127,10 +116,8 @@ setGlobalGrid( const GridType &globalGrid )
       //compute node distribution
       int dims[ Dimension ];
       for( int i = 0; i < Dimension; i++ )
-         dims[ i ]= this->domainDecomposition[ i ];
-
-
-      CommunicatorType::DimsCreate( this->nproc, Dimension, dims );
+         dims[ i ] = this->domainDecomposition[ i ];
+      MPI::Compute_dims( this->nproc, Dimension, dims );
       for( int i = 0; i < Dimension; i++ )
          this->domainDecomposition[ i ] = dims[ i ];
 
@@ -146,16 +133,16 @@ setGlobalGrid( const GridType &globalGrid )
       for( int i = 0; i < Dimension; i++ )
       {
          numberOfLarger[ i ] = globalGrid.getDimensions()[ i ] % this->domainDecomposition[ i ];
-         
+
          this->localSize[ i ] = globalGrid.getDimensions()[ i ] / this->domainDecomposition[ i ];
-         
+
          if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] )
             this->localSize[ i ] += 1;
-         
+
          if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] )
              this->globalBegin[ i ] = this->subdomainCoordinates[ i ] * this->localSize[ i ];
          else
-             this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + 
+             this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) +
                                      ( this->subdomainCoordinates[ i ] - numberOfLarger[ i ] ) * this->localSize[ i ];
       }
 
@@ -164,7 +151,7 @@ setGlobalGrid( const GridType &globalGrid )
   }
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setOverlaps( const SubdomainOverlapsType& lower,
@@ -191,7 +178,7 @@ setupGrid( GridType& grid)
    grid.setDistMesh(this);
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getSubdomainCoordinates() const
@@ -199,7 +186,7 @@ getSubdomainCoordinates() const
    return this->subdomainCoordinates;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalOrigin() const
@@ -207,15 +194,15 @@ getLocalOrigin() const
    return this->localOrigin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getSpaceSteps() const
 {
    return this->spaceSteps;
 }
-   
-template< int Dimension, typename Real, typename Device, typename Index >     
+
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isDistributed() const
@@ -223,7 +210,7 @@ isDistributed() const
    return this->distributed;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isBoundarySubdomain() const
@@ -234,7 +221,7 @@ isBoundarySubdomain() const
    return false;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLowerOverlap() const
@@ -242,7 +229,7 @@ getLowerOverlap() const
    return this->lowerOverlap;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getUpperOverlap() const
@@ -250,7 +237,7 @@ getUpperOverlap() const
    return this->upperOverlap;
 };
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalSize() const
@@ -258,7 +245,7 @@ getLocalSize() const
    return this->localSize;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalSize() const
@@ -266,7 +253,7 @@ getGlobalSize() const
    return this->globalGrid.getDimensions();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::GridType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalGrid() const
@@ -274,7 +261,7 @@ getGlobalGrid() const
     return this->globalGrid;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getGlobalBegin() const
@@ -282,7 +269,7 @@ getGlobalBegin() const
    return this->globalBegin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalGridSize() const
@@ -290,7 +277,7 @@ getLocalGridSize() const
    return this->localGridSize;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >     
+template< int Dimension, typename Real, typename Device, typename Index >
 const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType&
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getLocalBegin() const
@@ -298,7 +285,7 @@ getLocalBegin() const
    return this->localBegin;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >      
+template< int Dimension, typename Real, typename Device, typename Index >
    template< int EntityDimension >
 Index
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
@@ -307,7 +294,7 @@ getEntitiesCount() const
    return this->globalGrid. template getEntitiesCount< EntityDimension >();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >       
+template< int Dimension, typename Real, typename Device, typename Index >
    template< typename Entity >
 Index
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
@@ -316,23 +303,23 @@ getEntitiesCount() const
    return this->globalGrid. template getEntitiesCount< Entity >();
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
-void 
+template< int Dimension, typename Real, typename Device, typename Index >
+void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
-setCommunicationGroup(void * group)
+setCommunicationGroup(MPI_Comm group)
 {
-    this->communicationGroup=group;
+    this->group=group;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
-void *
+template< int Dimension, typename Real, typename Device, typename Index >
+MPI_Comm
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getCommunicationGroup() const
 {
-    return this->communicationGroup;
+    return this->group;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 int
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const
@@ -347,7 +334,7 @@ getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const
     return ret;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 isThereNeighbor(const CoordinatesType &direction) const
@@ -365,7 +352,7 @@ isThereNeighbor(const CoordinatesType &direction) const
 
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 setupNeighbors()
@@ -378,7 +365,7 @@ setupNeighbors()
          this->neighbors[ i ] = this->getRankOfProcCoord( coordinates );
       else
          this->neighbors[ i ] = -1;
-      
+
       // Handling periodic neighbors
       for( int d = 0; d < Dimension; d++ )
       {
@@ -388,12 +375,12 @@ setupNeighbors()
             coordinates[ d ] = 0;
          this->periodicNeighbors[ i ] = this->getRankOfProcCoord( coordinates );
       }
-      
+
       //std::cout << "Setting i-th neighbour to " << neighbors[ i ] << " and " << periodicNeighbors[ i ] << std::endl;
    }
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >   
+template< int Dimension, typename Real, typename Device, typename Index >
 const int*
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getNeighbors() const
@@ -402,7 +389,7 @@ getNeighbors() const
     return this->neighbors;
 }
 
-template< int Dimension, typename Real, typename Device, typename Index >   
+template< int Dimension, typename Real, typename Device, typename Index >
 const int*
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 getPeriodicNeighbors() const
@@ -412,12 +399,12 @@ getPeriodicNeighbors() const
 }
 
 template< int Dimension, typename Real, typename Device, typename Index >
-    template<typename CommunicatorType, typename DistributedGridType >
-bool 
+    template<typename DistributedGridType >
+bool
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
-SetupByCut(DistributedGridType &inputDistributedGrid, 
-         Containers::StaticVector<Dimension, int> savedDimensions, 
-         Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions, 
+SetupByCut(DistributedGridType &inputDistributedGrid,
+         Containers::StaticVector<Dimension, int> savedDimensions,
+         Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,int> reducedDimensions,
          Containers::StaticVector<DistributedGridType::getMeshDimension()-Dimension,IndexType> fixedIndexs)
 {
 
@@ -432,21 +419,17 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
       }
 
       //create new group with used nodes
-      typename CommunicatorType::CommunicationGroup *oldGroup=(typename CommunicatorType::CommunicationGroup *)(inputDistributedGrid.getCommunicationGroup());
-      if(this->isSet && this->communicationGroup != nullptr)
-            free(this->communicationGroup);
-      this->communicationGroup = std::malloc(sizeof(typename CommunicatorType::CommunicationGroup));
-
+      const MPI_Comm oldGroup=inputDistributedGrid.getCommunicationGroup();
       if(isInCut)
       {
            this->isSet=true;
-            
+
             auto fromGlobalMesh=inputDistributedGrid.getGlobalGrid();
             //set global grid
             typename GridType::PointType outOrigin;
             typename GridType::PointType outProportions;
             typename GridType::CoordinatesType outDimensions;
-            
+
             for(int i=0; i<Dimension;i++)
             {
                 outOrigin[i]=fromGlobalMesh.getOrigin()[savedDimensions[i]];
@@ -468,14 +451,13 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
                 this->spaceSteps[i]=inputDistributedGrid.getSpaceSteps()[savedDimensions[i]];
             }
 
-            int newRank= getRankOfProcCoord(this->subdomainCoordinates);
-
-            CommunicatorType::CreateNewGroup(isInCut,newRank,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup));
+            int newRank = getRankOfProcCoord(this->subdomainCoordinates);
+            this->group = MPI::Comm_split( oldGroup, 1, newRank );
 
             setupNeighbors();
 
 
-            
+
             bool isDistributed=false;
             for(int i=0;i<Dimension; i++)
             {
@@ -483,7 +465,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
             }
 
             this->distributed=isDistributed;
-            
+
             this->globalGrid.setDimensions(outDimensions);
             this->globalGrid.setDomain(outOrigin,outProportions);
 
@@ -491,7 +473,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid,
       }
       else
       {
-         CommunicatorType::CreateNewGroup(isInCut,0,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup));
+         this->group = MPI::Comm_split( oldGroup, MPI_UNDEFINED, 0 );
       }
 
       return false;
@@ -517,7 +499,7 @@ printProcessDistr() const
    for(int i=1; i<Dimension; i++)
         res=res+String("-")+convertToString(this->domainDecomposition[i]);
    return res;
-};  
+};
 
 template< int Dimension, typename Real, typename Device, typename Index >
 void
@@ -525,19 +507,18 @@ DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 writeProlog( Logger& logger )
 {
    logger.writeParameter( "Domain decomposition:", this->getDomainDecomposition() );
-}           
+}
 
-template< int Dimension, typename Real, typename Device, typename Index >    
+template< int Dimension, typename Real, typename Device, typename Index >
 void
 DistributedMesh< Grid< Dimension, Real, Device, Index > >::
 print( std::ostream& str ) const
 {
-   using Communicator = Communicators::MpiCommunicator;
-   for( int j = 0; j < Communicator::GetSize( Communicator::AllGroup ); j++ )
+   for( int j = 0; j < MPI::GetSize(); j++ )
    {
-      if( j == Communicator::GetRank( Communicator::AllGroup ) )
+      if( j == MPI::GetRank() )
       {
-         str << "Node : " << Communicator::GetRank( Communicator::AllGroup ) << std::endl
+         str << "Node : " << MPI::GetRank() << std::endl
              << " localOrigin : " << localOrigin << std::endl
              << " localBegin : " << localBegin << std::endl
              << " localSize : " << localSize  << std::endl
@@ -558,7 +539,7 @@ print( std::ostream& str ) const
             str << " " << periodicNeighbors[ i ];
          str << std::endl;
       }
-      Communicator::Barrier( Communicator::AllGroup );
+      MPI::Barrier();
    }
 }
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
index 38a7c04f0b5e1d3a86fe7cb30740dba2a242908d..edb08baf7b6cd909988e50446a19a1a66df42e6a 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
@@ -13,7 +13,6 @@
 #include <iostream>
 
 #include <TNL/File.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h>
 #include <TNL/Functions/MeshFunction.h>
@@ -21,11 +20,11 @@
 #include <TNL/Devices/Cuda.h>
 
 namespace TNL {
-namespace Meshes {   
+namespace Meshes {
 namespace DistributedMeshes {
 
 enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1, MpiIO=2 };
-    
+
 template< typename MeshFunction,
           DistrGridIOTypes type = LocalCopy,
           typename Mesh = typename MeshFunction::MeshType,
@@ -34,7 +33,7 @@ class DistributedGridIO
 {
 };
 
-template< typename MeshFunctionType > 
+template< typename MeshFunctionType >
 class DistributedGridIO< MeshFunctionType, Dummy >
 {
     bool save(const String& fileName, MeshFunctionType &meshFunction)
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
index 60605c6eb07514359e799026b73416eb404d9d95..698d7e41dc7ebbf5b2cf2481bb97c991620dcc96 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -12,6 +12,7 @@
 
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
+#include <TNL/MPI/getDataType.h>
 
 namespace TNL {
 namespace Meshes {
@@ -19,7 +20,7 @@ namespace DistributedMeshes {
 
 
 /*
- * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. 
+ * This variant creates a smaller copy of the MeshFunction, reduced to local entities, without overlaps.
  * It is slow and has high RAM consumption
  */
 template< typename MeshFunction,
@@ -88,8 +89,8 @@ class DistributedGridIO<
          return true;
 
       };
-            
-    static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+
+    static bool load(const String& fileName,MeshFunctionType &meshFunction)
     {
         auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
         if(distrGrid==NULL) //not distributed
@@ -99,10 +100,10 @@ class DistributedGridIO<
         }
 
         const MeshType& mesh=meshFunction.getMesh();
-        
+
         PointType spaceSteps=mesh.getSpaceSteps();
         PointType origin=mesh.getOrigin();
-                
+
         CoordinatesType localSize=distrGrid->getLocalSize();
         CoordinatesType localBegin=distrGrid->getLocalBegin();
 
@@ -111,33 +112,33 @@ class DistributedGridIO<
         newMesh->setSpaceSteps(spaceSteps);
         CoordinatesType newOrigin;
         newMesh->setOrigin(origin+spaceSteps*localBegin);
-        
+
         VectorType newDof(newMesh-> template getEntitiesCount< typename MeshType::Cell >());
         MeshFunctionType newMeshFunction;
-        newMeshFunction.bind(newMesh,newDof); 
+        newMeshFunction.bind(newMesh,newDof);
 
         CoordinatesType zeroCoord;
-        zeroCoord.setValue(0);        
+        zeroCoord.setValue(0);
 
         File file;
         file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), std::ios_base::in );
         newMeshFunction.boundLoad(file);
         file.close();
         CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
-        
+
         return true;
     };
-    
+
 };
 
 /*
- * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, 
+ * Saves distributed data into a single file without overlaps using MPI-IO and MPI datatypes,
  * EXPLOSIVE: works with only Grids and MPI
  * BAD IMPLEMENTTION creating MPI-Types at every save! -- I dont want contamine more places by MPI..
  */
 
 #ifdef HAVE_MPI
-template<typename MeshFunctionType> 
+template<typename MeshFunctionType>
 class DistributedGridIO_MPIIOBase
 {
    public:
@@ -152,13 +153,13 @@ class DistributedGridIO_MPIIOBase
     static bool save(const String& fileName, MeshFunctionType &meshFunction, RealType *data)
     {
 		auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
-        
+
         if(distrGrid==NULL) //not distributed
         {
             meshFunction.save(fileName);
+            return true; //saved locally; without a distributed mesh there is no communicator for the MPI-IO path below
         }
-
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
 
 	   MPI_File file;
       int ok=MPI_File_open( group,
@@ -168,7 +169,7 @@ class DistributedGridIO_MPIIOBase
                       &file);
       if( ok != 0 )
          throw std::runtime_error("Open file falied");
-      
+
 		int written=save(file,meshFunction, data,0);
 
         MPI_File_close(&file);
@@ -176,21 +177,21 @@ class DistributedGridIO_MPIIOBase
 		return written>0;
 
 	};
-    
+
     static int save(MPI_File &file, MeshFunctionType &meshFunction, RealType *data, int offset)
     {
 
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
 
        int headerSize;
-	   
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
-       if(Communicators::MpiCommunicator::GetRank(group)==0)
+       if(MPI::GetRank(group)==0)
        {
             MPI_File_seek(file,offset,MPI_SEEK_SET);
             headerSize=writeMeshFunctionHeader(file,meshFunction,dataCount);
@@ -200,9 +201,9 @@ class DistributedGridIO_MPIIOBase
 	   offset +=headerSize;
 
        MPI_File_set_view(file,offset,
-               Communicators::MPITypeResolver<RealType>::getType(),
+               TNL::MPI::getDataType<RealType>(),
                ftype,"native",MPI_INFO_NULL);
-       
+
        MPI_Status wstatus;
 
        MPI_File_write(file,data,1,atype,&wstatus);
@@ -222,7 +223,7 @@ class DistributedGridIO_MPIIOBase
         int fstarts[dim];
         int flsize[dim];
         int fgsize[dim];
-        
+
         hackArray(dim,fstarts,distrGrid->getGlobalBegin().getData());
         hackArray(dim,flsize,distrGrid->getLocalSize().getData());
         hackArray(dim,fgsize,distrGrid->getGlobalSize().getData());
@@ -230,14 +231,14 @@ class DistributedGridIO_MPIIOBase
         MPI_Type_create_subarray(dim,
             fgsize,flsize,fstarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype);
 
         MPI_Type_commit(ftype);
 
        int agsize[dim];
        int alsize[dim];
-       int astarts[dim]; 
+       int astarts[dim];
 
        hackArray(dim,astarts,distrGrid->getLocalBegin().getData());
        hackArray(dim,alsize,distrGrid->getLocalSize().getData());
@@ -246,7 +247,7 @@ class DistributedGridIO_MPIIOBase
        MPI_Type_create_subarray(dim,
             agsize,alsize,astarts,
             MPI_ORDER_C,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             atype);
        MPI_Type_commit(atype);
 
@@ -333,7 +334,7 @@ class DistributedGridIO_MPIIOBase
          return true;
       }
 
-      MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+      MPI_Comm group=distrGrid->getCommunicationGroup();
 
       MPI_File file;
       if( MPI_File_open( group,
@@ -350,39 +351,39 @@ class DistributedGridIO_MPIIOBase
       MPI_File_close(&file);
       return ret;
    }
-            
+
     /* Funky bomb - no checks - only dirty load */
-    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) 
+    static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset )
     {
        auto *distrGrid=meshFunction.getMesh().getDistributedMesh();
 
-       MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+       MPI_Comm group=distrGrid->getCommunicationGroup();
        MPI_Datatype ftype;
        MPI_Datatype atype;
        int dataCount=CreateDataTypes(distrGrid,&ftype,&atype);
-       
+
        MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL);
 
        int headerSize=0;
 
-       if(Communicators::MpiCommunicator::GetRank(group)==0)
+       if(MPI::GetRank(group)==0)
        {
             MPI_File_seek(file,offset,MPI_SEEK_SET);
             headerSize=readMeshFunctionHeader(file,meshFunction,dataCount);
        }
        MPI_Bcast(&headerSize, 1, MPI_INT,0, group);
-       
+
        if(headerSize<0)
             return false;
 
        offset+=headerSize;
 
        MPI_File_set_view(file,offset,
-            Communicators::MPITypeResolver<RealType>::getType(),
+            TNL::MPI::getDataType<RealType>(),
             ftype,"native",MPI_INFO_NULL);
        MPI_Status wstatus;
        MPI_File_read(file,(void*)data,1,atype,&wstatus);
-        
+
        MPI_Type_free(&atype);
        MPI_Type_free(&ftype);
 
@@ -412,7 +413,7 @@ class DistributedGridIO_MPIIOBase
         size+=count*sizeof(char);
         MPI_File_read(file, (void *)&count,1, MPI_INT, &rstatus);//DATACOUNT
         size+=1*sizeof(int);
-        
+
         if(count!=length)
         {
             std::cerr<<"Chyba načítání MeshFunction, délka dat v souboru neodpovídá očekávané délce" << std::endl;
@@ -421,7 +422,7 @@ class DistributedGridIO_MPIIOBase
 
         return size;
     };
-    
+
 };
 #endif
 
@@ -442,25 +443,25 @@ class DistributedGridIO<
       static bool save(const String& fileName, MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector=meshFunction.getData();
-            typename MeshFunctionType::RealType * data=hostVector.getData();  
+            typename MeshFunctionType::RealType * data=hostVector.getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
       };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
-            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >; 
+            using HostVectorType = Containers::Vector<typename MeshFunctionType::RealType, Devices::Host, typename MeshFunctionType::IndexType >;
             HostVectorType hostVector;
             hostVector.setLike(meshFunction.getData());
             auto* data=hostVector.getData();
@@ -469,7 +470,7 @@ class DistributedGridIO<
             return true;
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 };
@@ -491,26 +492,26 @@ class DistributedGridIO<
       static bool save(const String& fileName, MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             typename MeshFunctionType::RealType* data=meshFunction.getData().getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::save(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 
-      static bool load(const String& fileName,MeshFunctionType &meshFunction) 
+      static bool load(const String& fileName,MeshFunctionType &meshFunction)
       {
 #ifdef HAVE_MPI
-         if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+         if(MPI::isInitialized())//i.e. - isUsed
          {
             typename MeshFunctionType::RealType* data = meshFunction.getData().getData();
             return DistributedGridIO_MPIIOBase<MeshFunctionType>::load(fileName,meshFunction,data);
          }
 #endif
-         std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+         std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
          return false;
     };
 };
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
index 52217c336f8226854322e5cdd5ebcb29da108c47..8febf3c723b19bfffef001a70c6b3ed769a96420 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h
@@ -49,7 +49,7 @@ class DistributedGridIO_VectorField<
    static bool save(const String& fileName, Functions::VectorField< Size, MeshFunctionType > &vectorField)
    {
 #ifdef HAVE_MPI
-        if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+        if(MPI::isInitialized())//i.e. - isUsed
         {
             auto *distrGrid=vectorField.getMesh().getDistributedMesh();
 			if(distrGrid==NULL)
@@ -58,9 +58,9 @@ class DistributedGridIO_VectorField<
                                 return true;
 			}
 
-            MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+            MPI_Comm group=distrGrid->getCommunicationGroup();
 
-           //write 
+           //write
            MPI_File file;
            MPI_File_open( group,
                           const_cast< char* >( fileName.getString() ),
@@ -68,12 +68,12 @@ class DistributedGridIO_VectorField<
                           MPI_INFO_NULL,
                           &file);
 
-          
-           int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset      
-           if(Communicators::MpiCommunicator::GetRank(group)==0)
+
+           int offset=0; //global offset -> every mesh function creates its own data types, so we need to manage a global offset
+           if(MPI::GetRank(group)==0)
                offset+=writeVectorFieldHeader(file,vectorField);
            MPI_Bcast(&offset, 1, MPI_INT,0, group);
-           
+
            for( int i = 0; i < vectorField.getVectorDimension(); i++ )
            {
                typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData();  //here manage data transfer Device...
@@ -83,13 +83,13 @@ class DistributedGridIO_VectorField<
                   return false;
            }
 
-           MPI_File_close(&file); 
-           return true;           
+           MPI_File_close(&file);
+           return true;
         }
 #endif
-        std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+        std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
         return false;
-      
+
     };
 
 #ifdef HAVE_MPI
@@ -140,7 +140,7 @@ class DistributedGridIO_VectorField<
     static bool load(const String& fileName, Functions::VectorField<Size,MeshFunctionType> &vectorField)
     {
 #ifdef HAVE_MPI
-        if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed
+        if(MPI::isInitialized())//i.e. - isUsed
         {
             auto *distrGrid=vectorField.getMesh().getDistributedMesh();
 			if(distrGrid==NULL)
@@ -149,9 +149,9 @@ class DistributedGridIO_VectorField<
                                 return true;
 			}
 
-            MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
+            MPI_Comm group=distrGrid->getCommunicationGroup();
 
-           //write 
+           //read
            MPI_File file;
            MPI_File_open( group,
                           const_cast< char* >( fileName.getString() ),
@@ -159,12 +159,12 @@ class DistributedGridIO_VectorField<
                           MPI_INFO_NULL,
                           &file);
 
-          
-           int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset      
-           if(Communicators::MpiCommunicator::GetRank(group)==0)
+
+           int offset=0; //global offset -> every mesh function creates its own data types, so we need to manage a global offset
+           if(MPI::GetRank(group)==0)
                offset+=readVectorFieldHeader(file,vectorField);
            MPI_Bcast(&offset, 1, MPI_INT,0, group);
-           
+
            for( int i = 0; i < vectorField.getVectorDimension(); i++ )
            {
                typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData();  //here manage data transfer Device...
@@ -174,13 +174,13 @@ class DistributedGridIO_VectorField<
                   return false;
            }
 
-           MPI_File_close(&file); 
-           return true;           
+           MPI_File_close(&file);
+           return true;
         }
 #endif
-        std::cout << "MPIIO can be used only with MPICommunicator." << std::endl;
+        std::cout << "MPIIO can be used only when MPI is initialized." << std::endl;
         return false;
-      
+
     };
 
 };
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
index 5a11502403ca54be090c350b802e80d102926392..ed68150a041dc4ed209ac3a15ea226b96c801c6e 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
@@ -16,7 +16,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h>
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Pointers/SharedPointer.h>
 
 namespace TNL {
@@ -112,8 +111,7 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
          }
      }
 
-      template< typename CommunicatorType,
-                typename MeshFunctionType,
+      template< typename MeshFunctionType,
                 typename PeriodicBoundariesMaskPointer = Pointers::SharedPointer< MeshFunctionType > >
       void synchronize( MeshFunctionType &meshFunction,
                         bool periodicBoundaries = false,
@@ -145,9 +143,8 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
             PeriodicBoundariesMaskPointer( nullptr ) ); // the mask is used only when receiving data );
 
          //async send and receive
-         typename CommunicatorType::Request requests[2*this->getNeighborCount()];
-         typename CommunicatorType::CommunicationGroup group;
-         group=*((typename CommunicatorType::CommunicationGroup *)(distributedGrid->getCommunicationGroup()));
+         MPI_Request requests[2*this->getNeighborCount()];
+         MPI_Comm group = distributedGrid->getCommunicationGroup();
          int requestsCount( 0 );
 
          //send everything, recieve everything
@@ -159,22 +156,22 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea
             if( neighbors[ i ] != -1 )
             {
                //TNL_MPI_PRINT( "Sending data to node " << neighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
+               requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
                //TNL_MPI_PRINT( "Receiving data from node " << neighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
+               requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], neighbors[ i ], 0, group );
             }
             else if( periodicBoundaries && sendSizes[ i ] !=0 )
       	   {
                //TNL_MPI_PRINT( "Sending data to node " << periodicNeighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
+               requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast<RealType*>( sendBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
                //TNL_MPI_PRINT( "Receiving data to node " << periodicNeighbors[ i ] );
-               requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
+               requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast<RealType*>( recieveBuffers[ i ].getData() ),  sendSizes[ i ], periodicNeighbors[ i ], 1, group );
             }
          }
 
         //wait until send is done
         //TNL_MPI_PRINT( "Waiting for data ..." )
-        CommunicatorType::WaitAll( requests, requestsCount );
+        MPI::Waitall( requests, requestsCount );
 
         //copy data from receive buffers
         //TNL_MPI_PRINT( "Copying data ..." )
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
index 9a79f823d1379fbf4d314c9bc4bb3641fd5e9a78..21116d35725281aed1ea457c57bc19c370d395c1 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h
@@ -13,7 +13,7 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Meshes/DistributedMeshes/GlobalIndexStorage.h>
 #include <TNL/Meshes/MeshDetails/IndexPermutationApplier.h>
 
@@ -34,8 +34,6 @@ public:
    using PointType          = typename Mesh::PointType;
    using RealType           = typename PointType::RealType;
    using GlobalIndexArray   = typename Mesh::GlobalIndexArray;
-   using CommunicatorType   = Communicators::MpiCommunicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
    using VTKTypesArrayType  = Containers::Array< std::uint8_t, Devices::Sequential, GlobalIndexType >;
 
    DistributedMesh() = default;
@@ -101,12 +99,12 @@ public:
    /**
     * Methods specific to the distributed mesh
     */
-   void setCommunicationGroup( CommunicationGroup group )
+   void setCommunicationGroup( MPI_Comm group )
    {
       this->group = group;
    }
 
-   CommunicationGroup getCommunicationGroup() const
+   MPI_Comm getCommunicationGroup() const
    {
       return group;
    }
@@ -190,10 +188,10 @@ public:
       const GlobalIndexType verticesCount = localMesh.template getEntitiesCount< 0 >();
       const GlobalIndexType cellsCount = localMesh.template getEntitiesCount< Mesh::getMeshDimension() >();
 
-      CommunicatorType::Barrier();
-      for( int i = 0; i < CommunicatorType::GetSize(); i++ ) {
-         if( i == CommunicatorType::GetRank() ) {
-            str << "MPI rank:\t" << CommunicatorType::GetRank() << "\n"
+      MPI::Barrier();
+      for( int i = 0; i < MPI::GetSize(); i++ ) {
+         if( i == MPI::GetRank() ) {
+            str << "MPI rank:\t" << MPI::GetRank() << "\n"
                 << "\tMesh dimension:\t" << getMeshDimension() << "\n"
                 << "\tCell topology:\t" << getType( typename Cell::EntityTopology{} ) << "\n"
                 << "\tCells count:\t" << cellsCount << "\n"
@@ -230,13 +228,13 @@ public:
             }
             str.flush();
          }
-         CommunicatorType::Barrier();
+         MPI::Barrier();
       }
    }
 
 protected:
    MeshType localMesh;
-   CommunicationGroup group = CommunicatorType::NullGroup;
+   MPI_Comm group = MPI::NullGroup();
    int ghostLevels = 0;
 
    // vtkGhostType arrays for points and cells (cached for output into VTK formats)
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 724510bf4a6c576eafd5ba58d5d1065d4733a674..36f28ba458b67e872f9ea7d317f3654fb019215d 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -12,8 +12,10 @@
 
 #pragma once
 
+#include <TNL/Containers/ByteArraySynchronizer.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Meshes {
@@ -32,11 +34,22 @@ struct HasMeshType< T, typename Containers::Expressions::enable_if_type< typenam
 template< typename DistributedMesh,
           int EntityDimension = DistributedMesh::getMeshDimension() >
 class DistributedMeshSynchronizer
+: public Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >
 {
+   using Base = Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >;
+
 public:
    using DeviceType = typename DistributedMesh::DeviceType;
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
-   using CommunicatorType = typename DistributedMesh::CommunicatorType;
+   using ByteArrayView = typename Base::ByteArrayView;
+   using RequestsVector = typename Base::RequestsVector;
+
+   ~DistributedMeshSynchronizer()
+   {
+      // wait for pending async operation, otherwise it would crash
+      if( this->async_op.valid() )
+         this->async_op.wait();
+   }
 
    DistributedMeshSynchronizer() = default;
 
@@ -47,15 +60,9 @@ public:
       TNL_ASSERT_EQ( mesh.template getGlobalIndices< EntityDimension >().getSize(), mesh.getLocalMesh().template getEntitiesCount< EntityDimension >(),
                      "Global indices are not allocated properly." );
 
-      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
-      #ifdef HAVE_CUDA
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
-         cudaGetDevice(&this->gpu_id);
-      #endif
-
       group = mesh.getCommunicationGroup();
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // exchange the global index offsets so that each rank can determine the
       // owner of every entity by its global index
@@ -64,9 +71,9 @@ public:
       {
          Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc );
          sendbuf.setValue( ownStart );
-         CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                     globalOffsets.getData(), 1,
-                                     group );
+         MPI::Alltoall( sendbuf.getData(), 1,
+                        globalOffsets.getData(), 1,
+                        group );
       }
 
       // count local ghost entities for each rank
@@ -103,9 +110,9 @@ public:
          for( int j = 0; j < nproc; j++ )
          for( int i = 0; i < nproc; i++ )
             sendbuf.setElement( j, i, localGhostCounts[ i ] );
-         CommunicatorType::Alltoall( &sendbuf(0, 0), nproc,
-                                     &ghostEntitiesCounts(0, 0), nproc,
-                                     group );
+         MPI::Alltoall( &sendbuf(0, 0), nproc,
+                        &ghostEntitiesCounts(0, 0), nproc,
+                        group );
       }
 
       // allocate ghost offsets
@@ -122,14 +129,14 @@ public:
 
       // send indices of ghost entities - set them as ghost neighbors on the target rank
       {
-         std::vector< typename CommunicatorType::Request > requests;
+         RequestsVector requests;
 
          // send our ghost indices to the neighboring ranks
          GlobalIndexType ghostOffset = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >();
          ghostOffsets[ 0 ] = ghostOffset;
          for( int i = 0; i < nproc; i++ ) {
             if( ghostEntitiesCounts( rank, i ) > 0 ) {
-               requests.push_back( CommunicatorType::ISend(
+               requests.push_back( MPI::Isend(
                         mesh.template getGlobalIndices< EntityDimension >().getData() + ghostOffset,
                         ghostEntitiesCounts( rank, i ),
                         i, 0, group ) );
@@ -144,7 +151,7 @@ public:
          // receive ghost indices from the neighboring ranks
          for( int j = 0; j < nproc; j++ ) {
             if( ghostEntitiesCounts( j, rank ) > 0 ) {
-               requests.push_back( CommunicatorType::IRecv(
+               requests.push_back( MPI::Irecv(
                         ghostNeighbors.getData() + ghostNeighborOffsets[ j ],
                         ghostEntitiesCounts( j, rank ),
                         j, 0, group ) );
@@ -152,7 +159,7 @@ public:
          }
 
          // wait for all communications to finish
-         CommunicatorType::WaitAll( requests.data(), requests.size() );
+         MPI::Waitall( requests.data(), requests.size() );
 
          // convert received ghost indices from global to local
          ghostNeighbors -= ownStart;
@@ -182,43 +189,53 @@ public:
    template< typename Array >
    void synchronizeArray( Array& array, int valuesPerElement = 1 )
    {
-      TNL_ASSERT_EQ( array.getSize(), valuesPerElement * ghostOffsets[ ghostOffsets.getSize() - 1 ],
-                     "The array does not have the expected size." );
+      static_assert( std::is_same< typename Array::DeviceType, DeviceType >::value,
+                     "mismatched DeviceType of the array" );
       using ValueType = typename Array::ValueType;
 
-      // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/
-      #ifdef HAVE_CUDA
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
-         cudaSetDevice(gpu_id);
-      #endif
+      ByteArrayView view;
+      view.bind( reinterpret_cast<std::uint8_t*>( array.getData() ), sizeof(ValueType) * array.getSize() );
+      synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement );
+   }
 
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+   virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override
+   {
+      auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue );
+      MPI::Waitall( requests.data(), requests.size() );
+   }
+
+   virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override
+   {
+      TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ],
+                     "The array does not have the expected size." );
+
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // allocate send buffers (setSize does nothing if the array size is already correct)
-      sendBuffers.setSize( valuesPerElement * ghostNeighborOffsets[ nproc ] * sizeof(ValueType) );
+      sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > requests;
+      RequestsVector requests;
 
       // issue all receive async operations
       for( int j = 0; j < nproc; j++ ) {
          if( ghostEntitiesCounts( rank, j ) > 0 ) {
-            requests.push_back( CommunicatorType::IRecv(
-                     array.getData() + valuesPerElement * ghostOffsets[ j ],
-                     valuesPerElement * ghostEntitiesCounts( rank, j ),
+            requests.push_back( MPI::Irecv(
+                     array.getData() + bytesPerValue * ghostOffsets[ j ],
+                     bytesPerValue * ghostEntitiesCounts( rank, j ),
                      j, 0, group ) );
          }
       }
 
-      Containers::ArrayView< ValueType, DeviceType, GlobalIndexType > sendBuffersView;
-      sendBuffersView.bind( reinterpret_cast<ValueType*>( sendBuffers.getData() ), valuesPerElement * ghostNeighborOffsets[ nproc ] );
+      ByteArrayView sendBuffersView;
+      sendBuffersView.bind( sendBuffers.getData(), bytesPerValue * ghostNeighborOffsets[ nproc ] );
       const auto ghostNeighborsView = ghostNeighbors.getConstView();
       const auto arrayView = array.getConstView();
-      auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, valuesPerElement] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable
+      auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, bytesPerValue] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable
       {
-         for( int i = 0; i < valuesPerElement; i++ )
-            sendBuffersView[ i + valuesPerElement * (offset + k) ] = arrayView[ i + valuesPerElement * ghostNeighborsView[ offset + k ] ];
+         for( int i = 0; i < bytesPerValue; i++ )
+            sendBuffersView[ i + bytesPerValue * (offset + k) ] = arrayView[ i + bytesPerValue * ghostNeighborsView[ offset + k ] ];
       };
 
       for( int i = 0; i < nproc; i++ ) {
@@ -228,15 +245,14 @@ public:
             Algorithms::ParallelFor< DeviceType >::exec( (GlobalIndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset );
 
             // issue async send operation
-            requests.push_back( CommunicatorType::ISend(
-                     sendBuffersView.getData() + valuesPerElement * ghostNeighborOffsets[ i ],
-                     valuesPerElement * ghostEntitiesCounts( i, rank ),
+            requests.push_back( MPI::Isend(
+                     sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ],
+                     bytesPerValue * ghostEntitiesCounts( i, rank ),
                      i, 0, group ) );
          }
       }
 
-      // wait for all communications to finish
-      CommunicatorType::WaitAll( requests.data(), requests.size() );
+      return requests;
    }
 
    // performs a synchronization of a sparse matrix
@@ -252,11 +268,11 @@ public:
    {
       TNL_ASSERT_EQ( pattern.getRows(), ghostOffsets[ ghostOffsets.getSize() - 1 ], "invalid sparse pattern matrix" );
 
-      const int rank = CommunicatorType::GetRank( group );
-      const int nproc = CommunicatorType::GetSize( group );
+      const int rank = MPI::GetRank( group );
+      const int nproc = MPI::GetSize( group );
 
       // buffer for asynchronous communication requests
-      std::vector< typename CommunicatorType::Request > requests;
+      RequestsVector requests;
 
       Containers::Array< GlobalIndexType, Devices::Host, int > send_rankOffsets( nproc + 1 ), recv_rankOffsets( nproc + 1 );
       Containers::Array< GlobalIndexType, Devices::Host, GlobalIndexType > send_rowCapacities, send_rowPointers, send_columnIndices, recv_rowPointers, recv_columnIndices;
@@ -290,7 +306,7 @@ public:
             // send our row sizes to the target rank
             if( ! assumeConsistentRowCapacities )
                // issue async send operation
-               requests.push_back( CommunicatorType::ISend(
+               requests.push_back( MPI::Isend(
                         send_rowCapacities.getData() + send_rankOffsets[ i ],
                         ghostNeighborOffsets[ i + 1 ] - ghostNeighborOffsets[ i ],
                         i, 1, group ) );
@@ -318,7 +334,7 @@ public:
             if( send_rankOffsets[ i + 1 ] == send_rankOffsets[ i ] )
                continue;
             // issue async send operation
-            requests.push_back( CommunicatorType::ISend(
+            requests.push_back( MPI::Isend(
                      send_columnIndices.getData() + send_rowPointers[ send_rankOffsets[ i ] ],
                      send_rowPointers[ send_rankOffsets[ i + 1 ] ] - send_rowPointers[ send_rankOffsets[ i ] ],
                      i, 0, group ) );
@@ -335,7 +351,7 @@ public:
          // allocate row pointers
          recv_rowPointers.setSize( recv_rankOffsets[ nproc ] + 1 );
 
-         std::vector< typename CommunicatorType::Request > row_lengths_requests;
+         RequestsVector row_lengths_requests;
 
          // set row pointers
          GlobalIndexType rowPtr = 0;
@@ -353,7 +369,7 @@ public:
             else {
                // receive row sizes from the sender
                // issue async recv operation
-               row_lengths_requests.push_back( CommunicatorType::IRecv(
+               row_lengths_requests.push_back( MPI::Irecv(
                         recv_rowPointers.getData() + recv_rankOffsets[ i ],
                         ghostOffsets[ i + 1 ] - ghostOffsets[ i ],
                         i, 1, group ) );
@@ -362,7 +378,7 @@ public:
 
          if( ! assumeConsistentRowCapacities ) {
             // wait for all row lengths
-            CommunicatorType::WaitAll( row_lengths_requests.data(), row_lengths_requests.size() );
+            MPI::Waitall( row_lengths_requests.data(), row_lengths_requests.size() );
 
             // scan the rowPointers array to convert
             Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView;
@@ -377,7 +393,7 @@ public:
             if( recv_rankOffsets[ i + 1 ] == recv_rankOffsets[ i ] )
                continue;
             // issue async recv operation
-            requests.push_back( CommunicatorType::IRecv(
+            requests.push_back( MPI::Irecv(
                      recv_columnIndices.getData() + recv_rowPointers[ recv_rankOffsets[ i ] ],
                      recv_rowPointers[ recv_rankOffsets[ i + 1 ] ] - recv_rowPointers[ recv_rankOffsets[ i ] ],
                      i, 0, group ) );
@@ -385,7 +401,7 @@ public:
       }
 
       // wait for all communications to finish
-      CommunicatorType::WaitAll( requests.data(), requests.size() );
+      MPI::Waitall( requests.data(), requests.size() );
 
       return std::make_tuple( recv_rankOffsets, recv_rowPointers, recv_columnIndices );
    }
@@ -428,11 +444,8 @@ public:
    }
 
 protected:
-   // GOTCHA (see above)
-   int gpu_id = 0;
-
    // communication group taken from the distributed mesh
-   typename CommunicatorType::CommunicationGroup group;
+   MPI_Comm group;
 
    /**
     * Global offsets: array of size nproc where the i-th value is the lowest
diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
index 851ff66273fcc97f705aa37d5f6e1c8af3336260..b479544f7e1ebccb739027758d046a4263aa25e6 100644
--- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
+++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h
@@ -16,23 +16,21 @@
 namespace TNL {
    namespace Meshes {
       namespace DistributedMeshes {
-      
-template< typename Mesh,
-          typename Communicator >
+
+template< typename Mesh >
 class SubdomainOverlapsGetter
 {};
 
-// TODO: Specializations by the grid dimension can be avoided when the MPI directions are 
+// TODO: Specializations by the grid dimension can be avoided when the MPI directions are
 // rewritten in a dimension independent way
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 1;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -40,10 +38,9 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1,
        */
@@ -53,18 +50,17 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 2;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -72,10 +68,9 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * lower.y() is overlap of the subdomain at boundary where y = 0,
        * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1,
@@ -87,17 +82,16 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
-class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
+          typename Index >
+class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >
 {
    public:
-      
+
       static const int Dimension = 3;
       using MeshType = Grid< Dimension, Real, Device, Index >;
       using DeviceType = Device;
@@ -105,10 +99,9 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
       using DistributedMeshType = DistributedMesh< MeshType >;
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using CoordinatesType = typename DistributedMeshType::CoordinatesType;
-      using CommunicatorType = Communicator;
-      
+
       // Computes subdomain overlaps
-      /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. 
+      /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions.
        * lower.x() is overlap of the subdomain at boundary where x = 0,
        * lower.y() is overlap of the subdomain at boundary where y = 0,
        * lower.z() is overlap of the subdomain at boundary where z = 0,
@@ -122,7 +115,7 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >
                                IndexType subdomainOverlapSize,
                                const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0,
                                const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 );
-   
+
 };
 
 
diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
index 9dbb1372b061007b55082a81819d11888000bd74..aa185e1ecf0d08193feb8a58abeb40785914eead 100644
--- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Assert.h>
 #include <TNL/Meshes/Grid.h>
 
@@ -19,26 +20,25 @@ namespace TNL {
 
 /*
  * TODO: This could work when the MPI directions are rewritten
-         
+
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
              IndexType subdomainOverlapSize,
              const SubdomainOverlapsType& periodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( ! MPI::isDistributed() )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    for( int i = 0; i < Dimension; i++ )
    {
       CoordinatesType neighborDirection( 0 );
@@ -47,7 +47,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
          lower[ i ] = subdomainOverlapSize;
       else if( distributedMesh->getPeriodicNeighbors()[ Directions::getDirection( neighborDirection ) ] != rank )
          lower[ i ] = periodicBoundariesOverlapSize[ i ];
-      
+
       neighborDirection[ i ] = 1;
       if( subdomainCoordinates[ i ] < distributedMesh->getDomainDecomposition()[ i ] - 1 )
          upper[ i ] = subdomainOverlapSize;
@@ -55,15 +55,14 @@ getOverlaps( const DistributedMeshType* distributedMesh,
          upper[ i ] = periodicBoundariesOverlapSize[ i ];
    }
 }
- 
+
 */
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -71,13 +70,13 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -92,10 +91,9 @@ getOverlaps( const DistributedMeshType* distributedMesh,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -103,15 +101,15 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
+   int rank = MPI::GetRank();
    lower = 0;
    upper = 0;
-   
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -121,7 +119,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank )
       upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ];
-   
+
    if( subdomainCoordinates[ 1 ] > 0 )
       lower[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank )
@@ -135,10 +133,9 @@ getOverlaps( const DistributedMeshType* distributedMesh,
 
 template< typename Real,
           typename Device,
-          typename Index,
-          typename Communicator >
+          typename Index >
 void
-SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >::
+SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >::
 getOverlaps( const DistributedMeshType* distributedMesh,
              SubdomainOverlapsType& lower,
              SubdomainOverlapsType& upper,
@@ -146,13 +143,13 @@ getOverlaps( const DistributedMeshType* distributedMesh,
              const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize,
              const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize )
 {
-   if( ! CommunicatorType::isDistributed() )
+   if( MPI::GetSize() == 1 )
       return;
    TNL_ASSERT_TRUE( distributedMesh != NULL, "" );
 
    const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates();
-   int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup );
-   
+   int rank = MPI::GetRank();
+
    if( subdomainCoordinates[ 0 ] > 0 )
       lower[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank )
@@ -162,7 +159,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 0 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank )
       upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ];
-   
+
    if( subdomainCoordinates[ 1 ] > 0 )
       lower[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank )
@@ -172,7 +169,7 @@ getOverlaps( const DistributedMeshType* distributedMesh,
       upper[ 1 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZzYpXz ] != rank )
       upper[ 1 ] = upperPeriodicBoundariesOverlapSize[ 1 ];
-   
+
    if( subdomainCoordinates[ 2 ] > 0 )
       lower[ 2 ] = subdomainOverlapSize;
    else if( distributedMesh->getPeriodicNeighbors()[ ZmYzXz ] != rank )
diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
index 63a10b1cf08bd6c121402fa792c1f77e002f7516..120cadf808f7d4171d80e4f7ac375939ad1caf17 100644
--- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
+++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
@@ -19,14 +19,14 @@ namespace TNL {
 namespace Meshes {
 namespace DistributedMeshes {
 
-template< typename CommunicatorType, typename GlobalIndexType >
+template< typename GlobalIndexType >
 auto
-exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
+exchangeGhostEntitySeeds( MPI_Comm group,
                           const std::vector< std::vector< GlobalIndexType > >& seeds_vertex_indices,
                           const std::vector< std::vector< GlobalIndexType > >& seeds_entity_offsets )
 {
-   const int rank = CommunicatorType::GetRank( group );
-   const int nproc = CommunicatorType::GetSize( group );
+   const int rank = MPI::GetRank( group );
+   const int nproc = MPI::GetSize( group );
 
    // exchange sizes of the arrays
    Containers::Array< GlobalIndexType, Devices::Host, int > sizes_vertex_indices( nproc ), sizes_entity_offsets( nproc );
@@ -36,12 +36,12 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
          sendbuf_indices[ i ] = seeds_vertex_indices[ i ].size();
          sendbuf_offsets[ i ] = seeds_entity_offsets[ i ].size();
       }
-      CommunicatorType::Alltoall( sendbuf_indices.getData(), 1,
-                                  sizes_vertex_indices.getData(), 1,
-                                  group );
-      CommunicatorType::Alltoall( sendbuf_offsets.getData(), 1,
-                                  sizes_entity_offsets.getData(), 1,
-                                  group );
+      MPI::Alltoall( sendbuf_indices.getData(), 1,
+                     sizes_vertex_indices.getData(), 1,
+                     group );
+      MPI::Alltoall( sendbuf_offsets.getData(), 1,
+                     sizes_entity_offsets.getData(), 1,
+                     group );
    }
 
    // allocate arrays for the results
@@ -54,17 +54,17 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
    }
 
    // buffer for asynchronous communication requests
-   std::vector< typename CommunicatorType::Request > requests;
+   std::vector< MPI_Request > requests;
 
    // issue all async receive operations
    for( int j = 0; j < nproc; j++ ) {
       if( j == rank )
           continue;
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                foreign_seeds_vertex_indices[ j ].data(),
                foreign_seeds_vertex_indices[ j ].size(),
                j, 0, group ) );
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                foreign_seeds_entity_offsets[ j ].data(),
                foreign_seeds_entity_offsets[ j ].size(),
                j, 1, group ) );
@@ -74,30 +74,30 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group,
    for( int i = 0; i < nproc; i++ ) {
       if( i == rank )
           continue;
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                seeds_vertex_indices[ i ].data(),
                seeds_vertex_indices[ i ].size(),
                i, 0, group ) );
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                seeds_entity_offsets[ i ].data(),
                seeds_entity_offsets[ i ].size(),
                i, 1, group ) );
    }
 
    // wait for all communications to finish
-   CommunicatorType::WaitAll( requests.data(), requests.size() );
+   MPI::Waitall( requests.data(), requests.size() );
 
    return std::make_tuple( foreign_seeds_vertex_indices, foreign_seeds_entity_offsets );
 }
 
-template< typename CommunicatorType, typename GlobalIndexType >
+template< typename GlobalIndexType >
 auto
-exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
+exchangeGhostIndices( MPI_Comm group,
                       const std::vector< std::vector< GlobalIndexType > >& foreign_ghost_indices,
                       const std::vector< std::vector< GlobalIndexType > >& seeds_local_indices )
 {
-   const int rank = CommunicatorType::GetRank( group );
-   const int nproc = CommunicatorType::GetSize( group );
+   const int rank = MPI::GetRank( group );
+   const int nproc = MPI::GetSize( group );
 
    // allocate arrays for the results
    std::vector< std::vector< GlobalIndexType > > ghost_indices;
@@ -106,13 +106,13 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
       ghost_indices[ i ].resize( seeds_local_indices[ i ].size() );
 
    // buffer for asynchronous communication requests
-   std::vector< typename CommunicatorType::Request > requests;
+   std::vector< MPI_Request > requests;
 
    // issue all async receive operations
    for( int j = 0; j < nproc; j++ ) {
       if( j == rank )
           continue;
-      requests.push_back( CommunicatorType::IRecv(
+      requests.push_back( MPI::Irecv(
                ghost_indices[ j ].data(),
                ghost_indices[ j ].size(),
                j, 0, group ) );
@@ -122,14 +122,14 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group,
    for( int i = 0; i < nproc; i++ ) {
       if( i == rank )
           continue;
-      requests.push_back( CommunicatorType::ISend(
+      requests.push_back( MPI::Isend(
                foreign_ghost_indices[ i ].data(),
                foreign_ghost_indices[ i ].size(),
                i, 0, group ) );
    }
 
    // wait for all communications to finish
-   CommunicatorType::WaitAll( requests.data(), requests.size() );
+   MPI::Waitall( requests.data(), requests.size() );
 
    return ghost_indices;
 }
@@ -145,7 +145,6 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    using GlobalIndexType = typename DistributedMesh::GlobalIndexType;
    using LocalIndexType = typename DistributedMesh::LocalIndexType;
    using LocalMesh = typename DistributedMesh::MeshType;
-   using CommunicatorType = typename DistributedMesh::CommunicatorType;
 
    static_assert( ! std::is_same< DeviceType, Devices::Cuda >::value,
                   "this method can be called only for host meshes" );
@@ -154,8 +153,8 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    if( mesh.getGhostLevels() <= 0 )
       throw std::logic_error( "There are no ghost levels on the distributed mesh." );
 
-   const int rank = CommunicatorType::GetRank( mesh.getCommunicationGroup() );
-   const int nproc = CommunicatorType::GetSize( mesh.getCommunicationGroup() );
+   const int rank = MPI::GetRank( mesh.getCommunicationGroup() );
+   const int nproc = MPI::GetSize( mesh.getCommunicationGroup() );
 
    // 0. exchange cell data to prepare getCellOwner for use in getEntityOwner
    DistributedMeshSynchronizer< DistributedMesh, DistributedMesh::getMeshDimension() > cell_synchronizer;
@@ -235,9 +234,9 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
 
       Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc );
       sendbuf.setValue( localEntitiesCount );
-      CommunicatorType::Alltoall( sendbuf.getData(), 1,
-                                  globalOffsets.getData(), 1,
-                                  mesh.getCommunicationGroup() );
+      MPI::Alltoall( sendbuf.getData(), 1,
+                     globalOffsets.getData(), 1,
+                     mesh.getCommunicationGroup() );
    }
    globalOffsets.template scan< Algorithms::ScanType::Exclusive >();
 
@@ -288,7 +287,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
    }
 
    // 5. exchange seeds for ghost entities
-   const auto foreign_seeds = exchangeGhostEntitySeeds< CommunicatorType >( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets );
+   const auto foreign_seeds = exchangeGhostEntitySeeds( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets );
    const auto& foreign_seeds_vertex_indices = std::get< 0 >( foreign_seeds );
    const auto& foreign_seeds_entity_offsets = std::get< 1 >( foreign_seeds );
 
@@ -373,7 +372,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
       });
 
       // 6b. exchange global ghost indices
-      const auto ghost_indices = exchangeGhostIndices< CommunicatorType >( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices );
+      const auto ghost_indices = exchangeGhostIndices( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices );
 
       // 6c. set the global indices of our ghost entities
       bool done = true;
@@ -387,7 +386,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
 
       // 6d. check if finished
       bool all_done = false;
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
       if( all_done )
          break;
    }
diff --git a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
index 135e3c15a8c5f64b3032b2d2dba28a0b3d964bd9..52c0b543b641a6a9a433109c7f380063d16ac66e 100644
--- a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
+++ b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h
@@ -94,8 +94,7 @@ resolveAndLoadDistributedMesh( Functor&& functor,
    return resolveDistributedMeshType< ConfigTag, Device >( wrapper, fileName, fileFormat );
 }
 
-template< typename CommunicatorType,
-          typename MeshConfig,
+template< typename MeshConfig,
           typename Device >
 bool
 loadDistributedMesh( Mesh< MeshConfig, Device >& mesh,
@@ -145,8 +144,7 @@ decomposeMesh( const Config::ParameterContainer& parameters,
 }
 
 // overloads for grids
-template< typename CommunicatorType,
-          int Dimension,
+template< int Dimension,
           typename Real,
           typename Device,
           typename Index >
@@ -171,7 +169,7 @@ loadDistributedMesh( Grid< Dimension, Real, Device, Index >& mesh,
    std::cout << " [ OK ] " << std::endl;
 
    typename Meshes::DistributedMeshes::DistributedMesh<Grid< Dimension, Real, Device, Index >>::SubdomainOverlapsType overlap;
-   distributedMesh.template setGlobalGrid< CommunicatorType >( globalGrid );
+   distributedMesh.setGlobalGrid( globalGrid );
    distributedMesh.setupGrid(mesh);
    return true;
 }
@@ -191,7 +189,6 @@ decomposeMesh( const Config::ParameterContainer& parameters,
    using GridType = Grid< Dimension, Real, Device, Index >;
    using DistributedGridType = DistributedMeshes::DistributedMesh< GridType >;
    using SubdomainOverlapsType = typename DistributedGridType::SubdomainOverlapsType;
-   using CommunicatorType = typename Problem::CommunicatorType;
 
    SubdomainOverlapsType lower( 0 ), upper( 0 );
    distributedMesh.setOverlaps( lower, upper );
diff --git a/src/TNL/Meshes/Geometry/getEntityCenter.h b/src/TNL/Meshes/Geometry/getEntityCenter.h
index 6e869f6ec2655797d56b11adee010160eeb2890f..addef6b9f3d01839d81bfeb64c8cf948405d8942 100644
--- a/src/TNL/Meshes/Geometry/getEntityCenter.h
+++ b/src/TNL/Meshes/Geometry/getEntityCenter.h
@@ -39,7 +39,7 @@ getEntityCenter( const Mesh< MeshConfig, Device > & mesh,
 /*
  * Get an arithmetic mean of the entity's subvertices.
  *
- * For an simplex entity this corresponds to the centroid of the entity, but
+ * For a simplex entity this corresponds to the centroid of the entity, but
  * note that other shapes such as general polygons have different formulas for
  * the centroid: https://en.wikipedia.org/wiki/Centroid#Centroid_of_a_polygon
  */
diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h
index 70d5614ce9de85691da7d83f53eb65fabcb9f695..fb1e2d468b097b9a292d5d901bdbe2f32630e565 100644
--- a/src/TNL/Meshes/Geometry/getEntityMeasure.h
+++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h
@@ -19,6 +19,7 @@
 #include <TNL/Meshes/Topologies/Triangle.h>
 #include <TNL/Meshes/Topologies/Quadrangle.h>
 #include <TNL/Meshes/Topologies/Tetrahedron.h>
+#include <TNL/Meshes/Topologies/Hexahedron.h>
 
 namespace TNL {
 namespace Meshes {
@@ -148,5 +149,28 @@ getEntityMeasure( const Mesh< MeshConfig, Device > & mesh,
     return getTetrahedronVolume( v3 - v0, v2 - v0, v1 - v0 );
 }
 
+template< typename MeshConfig, typename Device >
+__cuda_callable__
+typename MeshConfig::RealType
+getEntityMeasure( const Mesh< MeshConfig, Device > & mesh,
+                  const MeshEntity< MeshConfig, Device, Topologies::Hexahedron > & entity )
+{
+    const auto& v0 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 0 ) );
+    const auto& v1 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 1 ) );
+    const auto& v2 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 2 ) );
+    const auto& v3 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 3 ) );
+    const auto& v4 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 4 ) );
+    const auto& v5 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 5 ) );
+    const auto& v6 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 6 ) );
+    const auto& v7 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 7 ) );
+    // The measure is the sum of 6 tetrahedra sharing the vertex v4: 3 pyramids with apex v4, each over a quad split into 2 triangles, see https://www.cfd-online.com/Forums/main/163122-volume-general-hexahedron.html#post574650
+    return getTetrahedronVolume( v0 - v4, v3 - v4, v1 - v4 )   // quad (v0, v1, v2, v3): triangle v0-v3-v1
+         + getTetrahedronVolume( v2 - v4, v3 - v4, v1 - v4 )   //                        triangle v2-v3-v1
+         + getTetrahedronVolume( v1 - v4, v2 - v4, v5 - v4 )   // quad (v1, v2, v6, v5): triangle v1-v2-v5
+         + getTetrahedronVolume( v6 - v4, v2 - v4, v5 - v4 )   //                        triangle v6-v2-v5
+         + getTetrahedronVolume( v3 - v4, v2 - v4, v7 - v4 )   // quad (v3, v2, v6, v7): triangle v3-v2-v7
+         + getTetrahedronVolume( v6 - v4, v2 - v4, v7 - v4 );  //                        triangle v6-v2-v7
+}
+
 } // namespace Meshes
 } // namespace TNL
diff --git a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
index 53680086264bd6a803ce26307264d6a28171a387..d3fa6ea50fef14482bc913aaf0fe2fec62f6c110 100644
--- a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
+++ b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
+#include <TNL/Meshes/Topologies/Edge.h>
 
 namespace TNL {
 namespace Meshes {
@@ -87,5 +88,63 @@ getOutwardNormalVector( const Grid & grid,
    }
 }
 
+template< typename MeshConfig, typename Device >
+__cuda_callable__
+typename MeshTraits< MeshConfig >::PointType
+getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh,
+                        const MeshEntity< MeshConfig, Device, Topologies::Edge > & face,
+                        typename MeshTraits< MeshConfig >::PointType cellCenter )
+{
+   using MeshType = Mesh< MeshConfig, Device >;
+   using FaceType = MeshEntity< MeshConfig, Device, Topologies::Edge >;
+   using PointType = typename MeshTraits< MeshConfig >::PointType;
+   static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" );
+   static_assert( MeshConfig::worldDimension == 2, "TODO: normal vectors for 2D meshes in a 3D space are not implemented yet" );
+
+   const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) );
+   const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) );
+   const PointType u = v0 - v1;   // vector along the edge
+   const PointType n {u[1], -u[0]};   // perpendicular to u (their dot product is zero), not normalized yet
+
+   // n is returned (scaled to unit length) when it points away from cellCenter as seen from the face center, otherwise its opposite is returned
+   const PointType faceCenter = getEntityCenter( mesh, face );
+   if( dot( n, cellCenter - faceCenter ) < 0 )
+      return n / l2Norm( n );
+   else
+      return - n / l2Norm( n );
+}
+
+template< typename MeshConfig, typename Device, typename EntityTopology >
+__cuda_callable__
+typename MeshTraits< MeshConfig >::PointType
+getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh,
+                        const MeshEntity< MeshConfig, Device, EntityTopology > & face,
+                        typename MeshTraits< MeshConfig >::PointType cellCenter )
+{
+   using MeshType = Mesh< MeshConfig, Device >;
+   using FaceType = MeshEntity< MeshConfig, Device, EntityTopology >;
+   using PointType = typename MeshTraits< MeshConfig >::PointType;
+   static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" );
+   static_assert( MeshConfig::worldDimension == 3, "general overload intended for 3D was called with the wrong world dimension" );
+
+   const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) );
+   const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) );
+   const auto& v2 = mesh.getPoint( face.template getSubentityIndex< 0 >( 2 ) );
+   const PointType u1 = v0 - v1;   // only the first three vertices of the face are used to span its plane
+   const PointType u2 = v0 - v2;
+   const PointType n {   // n = u1 x u2, not normalized yet
+      u1.y() * u2.z() - u1.z() * u2.y(),   // first component of the cross product
+      u1.z() * u2.x() - u1.x() * u2.z(),   // second component of the cross product
+      u1.x() * u2.y() - u1.y() * u2.x()    // third component of the cross product
+   };
+
+   // n is returned (scaled to unit length) when it points away from cellCenter as seen from the face center, otherwise its opposite is returned
+   const PointType faceCenter = getEntityCenter( mesh, face );
+   if( dot( n, cellCenter - faceCenter ) < 0 )
+      return n / l2Norm( n );
+   else
+      return - n / l2Norm( n );
+}
+
 } // namespace Meshes
 } // namespace TNL
diff --git a/src/TNL/Meshes/Readers/MeshReader.h b/src/TNL/Meshes/Readers/MeshReader.h
index 88e2986bad3b8590ac53f6112202486348fcc7a8..8bf8189ba161899a54527ad7c0fee281ee9c246b 100644
--- a/src/TNL/Meshes/Readers/MeshReader.h
+++ b/src/TNL/Meshes/Readers/MeshReader.h
@@ -150,6 +150,18 @@ public:
          throw MeshReaderError( "VTKReader", "MeshBuilder failed" );
    }
 
+   virtual VariantVector
+   readPointData( std::string arrayName )   // default for formats without point data support: always throws, readers supporting it override this
+   {
+      throw Exceptions::NotImplementedError( "readPointData is not implemented in the mesh reader for this specific file format." );
+   }
+
+   virtual VariantVector
+   readCellData( std::string arrayName )   // default for formats without cell data support: always throws, readers supporting it override this
+   {
+      throw Exceptions::NotImplementedError( "readCellData is not implemented in the mesh reader for this specific file format." );
+   }
+
    std::string
    getMeshType() const
    {
diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h
index 666aa4f453478c260240fb45a1dc78d8698ce39e..725aa7fec4baaf797afdda3908f126b728744fc2 100644
--- a/src/TNL/Meshes/Readers/PVTUReader.h
+++ b/src/TNL/Meshes/Readers/PVTUReader.h
@@ -14,7 +14,7 @@
 
 #include <experimental/filesystem>
 
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Meshes/Readers/VTUReader.h>
 #include <TNL/Meshes/MeshDetails/layers/EntityTags/Traits.h>
 
@@ -67,13 +67,13 @@ class PVTUReader
          throw MeshReaderError( "PVTUReader", "the file does not contain any <Piece> element." );
 
       // check that the number of pieces matches the number of MPI ranks
-      const int nproc = CommunicatorType::GetSize( group );
+      const int nproc = MPI::GetSize( group );
       if( (int) pieceSources.size() != nproc )
          throw MeshReaderError( "PVTUReader", "the number of subdomains does not match the number of MPI ranks ("
                                               + std::to_string(pieceSources.size()) + " vs " + std::to_string(nproc) + ")." );
 
       // read the local piece source
-      const int rank = CommunicatorType::GetRank( group );
+      const int rank = MPI::GetRank( group );
       localReader.setFileName( pieceSources[ rank ] );
       localReader.detectMesh();
 
@@ -100,12 +100,9 @@ class PVTUReader
 #endif
 
 public:
-   using CommunicatorType = Communicators::MpiCommunicator;
-   using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
-
    PVTUReader() = default;
 
-   PVTUReader( const std::string& fileName, CommunicationGroup group = CommunicatorType::AllGroup )
+   PVTUReader( const std::string& fileName, MPI_Comm group = MPI::AllGroup() )
    : XMLVTK( fileName ), group( group )
    {}
 
@@ -211,6 +208,18 @@ public:
       mesh.setCommunicationGroup( group );
    }
 
+   virtual VariantVector
+   readPointData( std::string arrayName ) override
+   {
+      return localReader.readPointData( arrayName );   // delegates to the reader of the piece file belonging to the current rank
+   }
+
+   virtual VariantVector
+   readCellData( std::string arrayName ) override
+   {
+      return localReader.readCellData( arrayName );   // delegates to the reader of the piece file belonging to the current rank
+   }
+
    virtual void reset() override
    {
       resetBase();
@@ -221,7 +230,7 @@ public:
    }
 
 protected:
-   CommunicationGroup group;
+   MPI_Comm group;
 
    int ghostLevels = 0;
    int minCommonVertices = 0;
diff --git a/src/TNL/Meshes/Readers/XMLVTK.h b/src/TNL/Meshes/Readers/XMLVTK.h
index fb8e1eb40df1919a7a57b86db8978fde042b2262..af864e6e9c603f5998c6c29b9a91a0b43c64167d 100644
--- a/src/TNL/Meshes/Readers/XMLVTK.h
+++ b/src/TNL/Meshes/Readers/XMLVTK.h
@@ -325,8 +325,8 @@ public:
 #endif
    }
 
-   VariantVector
-   readPointData( std::string arrayName )
+   virtual VariantVector
+   readPointData( std::string arrayName ) override
    {
 #ifdef HAVE_TINYXML2
       return readPointOrCellData( "PointData", arrayName );
@@ -335,8 +335,8 @@ public:
 #endif
    }
 
-   VariantVector
-   readCellData( std::string arrayName )
+   virtual VariantVector
+   readCellData( std::string arrayName ) override
    {
 #ifdef HAVE_TINYXML2
       return readPointOrCellData( "CellData", arrayName );
diff --git a/src/TNL/Meshes/Readers/getMeshReader.h b/src/TNL/Meshes/Readers/getMeshReader.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c2c18a8e3da03a923a62be80629aec81ce6d246
--- /dev/null
+++ b/src/TNL/Meshes/Readers/getMeshReader.h
@@ -0,0 +1,75 @@
+/***************************************************************************
+                          getMeshReader.h  -  description
+                             -------------------
+    begin                : Nov 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <experimental/filesystem>
+
+#include <TNL/Meshes/Readers/NetgenReader.h>
+#include <TNL/Meshes/Readers/VTKReader.h>
+#include <TNL/Meshes/Readers/VTUReader.h>
+#include <TNL/Meshes/Readers/PVTUReader.h>
+
+namespace TNL {
+namespace Meshes {
+namespace Readers {
+
+/**
+ * Creates a mesh reader for the given file.
+ *
+ * \param fileName   path of the mesh file
+ * \param fileFormat one of "ng", "vtk", "vtu", "pvtu", or "auto" to use the
+ *                   extension of fileName (without the leading dot)
+ * \return the reader, or nullptr (after printing a message to std::cerr)
+ *         when the format is not supported
+ *
+ * The function is inline because it is defined in a header: a non-inline
+ * definition would violate the one-definition rule as soon as the header is
+ * included in more than one translation unit.
+ */
+inline std::shared_ptr< Readers::MeshReader >
+getMeshReader( const std::string& fileName,
+               const std::string& fileFormat )
+{
+   namespace fs = std::experimental::filesystem;
+   std::string format = fileFormat;
+   if( format == "auto" ) {
+      // explicit conversion: path::string_type is not std::string on all platforms
+      format = fs::path(fileName).extension().string();
+      if( format.length() > 0 )
+         // remove dot from the extension
+         format = format.substr(1);
+   }
+
+   if( format == "ng" )
+      return std::make_shared< Readers::NetgenReader >( fileName );
+   else if( format == "vtk" )
+      return std::make_shared< Readers::VTKReader >( fileName );
+   else if( format == "vtu" )
+      return std::make_shared< Readers::VTUReader >( fileName );
+   else if( format == "pvtu" )
+      return std::make_shared< Readers::PVTUReader >( fileName );
+
+   if( fileFormat == "auto" )
+      std::cerr << "File '" << fileName << "' has unsupported format (based on the file extension): " << format << ".";
+   else
+      std::cerr << "Unsupported fileFormat parameter: " << fileFormat << ".";
+   std::cerr << " Supported formats are 'vtk', 'vtu', 'pvtu' and 'ng'." << std::endl;
+   return nullptr;
+}
+
+} // namespace Readers
+} // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/VTKTraits.h b/src/TNL/Meshes/VTKTraits.h
index e09b6c34205aeeeb95e51d7a56a730b82c07c837..0883b607a54ab755b96907ee243dca614f14f08b 100644
--- a/src/TNL/Meshes/VTKTraits.h
+++ b/src/TNL/Meshes/VTKTraits.h
@@ -172,16 +172,16 @@ enum class CellGhostTypes
    DUPLICATECELL = 1,        // the cell is present on multiple processors
    HIGHCONNECTIVITYCELL = 2, // the cell has more neighbors than in a regular mesh
    LOWCONNECTIVITYCELL = 4,  // the cell has less neighbors than in a regular mesh
-   REFINEDCELL = 8,          // other cells are present that refines it.
+   REFINEDCELL = 8,          // other cells are present that refine it
    EXTERIORCELL = 16,        // the cell is on the exterior of the data set
-   HIDDENCELL = 32           // the cell is needed to maintain connectivity, but the data values should be ignored.
+   HIDDENCELL = 32           // the cell is needed to maintain connectivity, but the data values should be ignored
 };
 
 enum class PointGhostTypes
 : std::uint8_t
 {
    DUPLICATEPOINT = 1,  // the cell is present on multiple processors
-   HIDDENPOINT = 2      // the point is needed to maintain connectivity, but the data values should be ignored.
+   HIDDENPOINT = 2      // the point is needed to maintain connectivity, but the data values should be ignored
 };
 
 /**
diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h
index 8ef4d2b7bc1c6f84090cf2d8f0dbd929bb4c9bda..2f332d20ee39fa89a2ec3f765c61f6eaf832f67d 100644
--- a/src/TNL/Meshes/Writers/PVTUWriter.h
+++ b/src/TNL/Meshes/Writers/PVTUWriter.h
@@ -31,7 +31,7 @@ public:
    PVTUWriter() = delete;
 
    PVTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {}
 
    // If desired, cycle and time of the simulation can put into the file. This follows the instructions at
@@ -65,9 +65,8 @@ public:
 
    // add all pieces and return the source path for the current rank
    // (useful for parallel writing)
-   template< typename Communicator >
    std::string addPiece( const String& mainFileName,
-                         const typename Communicator::CommunicationGroup group );
+                         const MPI_Comm group );
 
    ~PVTUWriter();
 
@@ -79,7 +78,7 @@ protected:
 
    void writeFooter();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
diff --git a/src/TNL/Meshes/Writers/PVTUWriter.hpp b/src/TNL/Meshes/Writers/PVTUWriter.hpp
index 71e19da1de2bcf4e77f22f31473eacd0d57340a9..affee65a289d2465150cb80a41aa396710939e90 100644
--- a/src/TNL/Meshes/Writers/PVTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/PVTUWriter.hpp
@@ -137,15 +137,14 @@ PVTUWriter< Mesh >::addPiece( const String& mainFileName,
 }
 
 template< typename Mesh >
-   template< typename Communicator >
 std::string
 PVTUWriter< Mesh >::addPiece( const String& mainFileName,
-                              const typename Communicator::CommunicationGroup group )
+                              const MPI_Comm group )
 {
    std::string source;
-   for( int i = 0; i < Communicator::GetSize( group ); i++ ) {
+   for( int i = 0; i < MPI::GetSize( group ); i++ ) {
       const std::string s = addPiece( mainFileName, i );
-      if( i == Communicator::GetRank( group ) )
+      if( i == MPI::GetRank( group ) )
          source = s;
    }
    return source;
diff --git a/src/TNL/Meshes/Writers/VTKWriter.h b/src/TNL/Meshes/Writers/VTKWriter.h
index e1c5fae9786c670dc43002efb3a65856a9d3b78c..db0c09b1306ce3524d2d088215f29919a0c4740d 100644
--- a/src/TNL/Meshes/Writers/VTKWriter.h
+++ b/src/TNL/Meshes/Writers/VTKWriter.h
@@ -45,7 +45,7 @@ public:
    VTKWriter() = delete;
 
    VTKWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::binary )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {
       if( format != VTK::FileFormat::ascii && format != VTK::FileFormat::binary )
          throw std::domain_error("The Legacy VTK file formats support only ASCII and BINARY formats.");
@@ -78,7 +78,7 @@ protected:
 
    void writeHeader();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
diff --git a/src/TNL/Meshes/Writers/VTKWriter.hpp b/src/TNL/Meshes/Writers/VTKWriter.hpp
index 125366d0334507ff9f50c765417077cc1f253978..801d3bc1926a944396fc19b9765e3d6bfb9ad841 100644
--- a/src/TNL/Meshes/Writers/VTKWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTKWriter.hpp
@@ -509,7 +509,7 @@ VTKWriter< Mesh >::writeDataArray( const Array& array,
    // use a host buffer if direct access to the array elements is not possible
    if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value )
    {
-      using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >;
+      using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >;
       HostArray hostBuffer;
       hostBuffer = array;
       writeDataArray( hostBuffer, name, numberOfComponents );
diff --git a/src/TNL/Meshes/Writers/VTUWriter.h b/src/TNL/Meshes/Writers/VTUWriter.h
index 9f715dce65af314acfa5eca21467c5d7eabcef84..00765cc0d14434379850d59d857a5cde463adfdb 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.h
+++ b/src/TNL/Meshes/Writers/VTUWriter.h
@@ -44,7 +44,7 @@ public:
    VTUWriter() = delete;
 
    VTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed )
-   : str(str), format(format)
+   : str(str.rdbuf()), format(format)
    {}
 
    // If desired, cycle and time of the simulation can put into the file. This follows the instructions at
@@ -78,7 +78,7 @@ protected:
 
    void writeFooter();
 
-   std::ostream& str;
+   std::ostream str;
 
    VTK::FileFormat format;
 
diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp
index 8d609f0a78a9327885751065f015b607c39f45e0..c8093010d6db57db63675d8adc0fcab002a997b4 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTUWriter.hpp
@@ -83,6 +83,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -94,7 +95,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
          connectivity.push_back( i );
          connectivity.push_back( i+1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -106,6 +107,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -116,7 +118,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -128,6 +130,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 2 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 2 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -142,7 +145,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -154,6 +157,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -161,21 +165,21 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
                      std::vector< std::uint8_t > & types )
    {
       for( MeshIndex j = 0; j < mesh.getDimensions().y(); j++ )
-      for( MeshIndex i = 0; i < ( mesh.getDimensions().x() + 1 ); i++ )
+      for( MeshIndex i = 0; i < (mesh.getDimensions().x() + 1); i++ )
       {
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
-      for( MeshIndex j = 0; j < (mesh.getDimensions().y()+1); j++ )
+      for( MeshIndex j = 0; j < (mesh.getDimensions().y() + 1); j++ )
       for( MeshIndex i = 0; i < mesh.getDimensions().x(); i++ )
       {
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -187,6 +191,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -198,7 +203,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( j * mesh.getDimensions().x() + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -210,6 +215,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 3 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -229,7 +235,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Voxel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -241,6 +247,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 2 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -256,7 +263,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ )
@@ -268,7 +275,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ )
@@ -280,7 +287,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Pixel );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -292,6 +299,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 1 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 1 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -305,7 +313,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ )
@@ -315,7 +323,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
 
       for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ )
@@ -325,7 +333,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Line );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -337,6 +345,7 @@ template< typename MeshReal,
 struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0 >
 {
    using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >;
+   using Entity = typename Mesh::template EntityType< 0 >;
 
    static void exec( const Mesh& mesh,
                      std::vector< typename Mesh::GlobalIndexType > & connectivity,
@@ -349,7 +358,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
       {
          connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i );
          offsets.push_back( connectivity.size() );
-         types.push_back( (std::uint8_t) VTK::EntityShape::Vertex );
+         types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape );
       }
    }
 };
@@ -459,7 +468,7 @@ VTUWriter< Mesh >::writeDataArray( const Array& array,
    // use a host buffer if direct access to the array elements is not possible
    if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value )
    {
-      using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >;
+      using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >;
       HostArray hostBuffer;
       hostBuffer = array;
       writeDataArray( hostBuffer, name, numberOfComponents );
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 1da61c51ef80c77b133ae9060ef410d1a4a00949..131697afb38443a8c5bbc2206c30a3953a578d27 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -146,7 +146,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
         if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
             Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(initialConditionFile, *uPointer );
         synchronizer.setDistributedGrid( uPointer->getMesh().getDistributedMesh() );
-        synchronizer.template synchronize<CommunicatorType>( *uPointer );
+        synchronizer.synchronize( *uPointer );
     }
     else
     {
@@ -173,7 +173,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-   template< typename MatrixPointer >          
+   template< typename MatrixPointer >
 bool
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 setupLinearSystem( MatrixPointer& matrixPointer )
@@ -247,7 +247,7 @@ getExplicitUpdate( const RealType& time,
     *
     * You may use supporting vectors again if you need.
     */
-   
+
    this->bindDofs( uDofs );
    this->fuPointer->bind( this->getMesh(), *fuDofs );
    this->explicitUpdater.template update< typename Mesh::Cell, Communicator >( time, tau, this->getMesh(), this->uPointer, this->fuPointer );
@@ -258,7 +258,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-void 
+void
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 applyBoundaryConditions( const RealType& time,
                          DofVectorPointer& uDofs )
@@ -272,7 +272,7 @@ template< typename Mesh,
           typename RightHandSide,
           typename Communicator,
           typename DifferentialOperator >
-    template< typename MatrixPointer > 
+    template< typename MatrixPointer >
 void
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >::
 assemblyLinearSystem( const RealType& time,
@@ -282,7 +282,7 @@ assemblyLinearSystem( const RealType& time,
                       DofVectorPointer& bPointer )
 {
    this->bindDofs( dofsPointer );
-   this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( 
+   this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >(
       time,
       tau,
       this->getMesh(),
diff --git a/src/TNL/Problems/PDEProblem_impl.h b/src/TNL/Problems/PDEProblem_impl.h
index 6a3aa63e6d82bce68b9f549b413d275504f137aa..f42f18b165887c4ad006bad75e5ac0bdc3beea98 100644
--- a/src/TNL/Problems/PDEProblem_impl.h
+++ b/src/TNL/Problems/PDEProblem_impl.h
@@ -59,7 +59,7 @@ template< typename Mesh,
 typename PDEProblem< Mesh, Communicator, Real, Device, Index >::IndexType
 PDEProblem< Mesh, Communicator, Real, Device, Index >::
 subdomainOverlapSize()
-{ 
+{
    return 1;
 }
 
@@ -77,9 +77,9 @@ getSubdomainOverlaps( const Config::ParameterContainer& parameters,
                       SubdomainOverlapsType& upper )
 {
    using namespace Meshes::DistributedMeshes;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() );
 }
-      
+
 template< typename Mesh,
           typename Communicator,
           typename Real,
diff --git a/src/TNL/Solvers/Linear/BICGStab.h b/src/TNL/Solvers/Linear/BICGStab.h
index 2cede824ad00c4ea8b4cb2f270d86882f5bfcfe3..474a45d023a579095bf22db383cdbee4feb43294 100644
--- a/src/TNL/Solvers/Linear/BICGStab.h
+++ b/src/TNL/Solvers/Linear/BICGStab.h
@@ -37,6 +37,10 @@ public:
    bool solve( ConstVectorViewType b, VectorViewType x ) override;
 
 protected:
+   void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b );
+
+   void preconditioned_matvec( ConstVectorViewType src, VectorViewType dst );
+
    void setSize( const VectorViewType& x );
 
    bool exact_residue = false;
diff --git a/src/TNL/Solvers/Linear/BICGStab_impl.h b/src/TNL/Solvers/Linear/BICGStab_impl.h
index baa4b6363e712ec4156e7a4bc79bc6e32bcc031c..ff3b42ed0c7b7cc65527481f8edd2990b94b639d 100644
--- a/src/TNL/Solvers/Linear/BICGStab_impl.h
+++ b/src/TNL/Solvers/Linear/BICGStab_impl.h
@@ -38,111 +38,80 @@ setup( const Config::ParameterContainer& parameters,
 }
 
 template< typename Matrix >
-bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x )
+bool
+BICGStab< Matrix >::
+solve( ConstVectorViewType b, VectorViewType x )
 {
    this->setSize( x );
 
-   RealType alpha, beta, omega, aux, rho, rho_old, b_norm;
+   RealType alpha, beta, omega, rho, rho_old, b_norm, r_ast_sqnorm;
 
+   // initialize the norm of the preconditioned right-hand-side
    if( this->preconditioner ) {
       this->preconditioner->solve( b, M_tmp );
       b_norm = lpNorm( M_tmp, 2.0 );
-
-      this->matrix->vectorProduct( x, M_tmp );
-      M_tmp = b - M_tmp;
-      this->preconditioner->solve( M_tmp, r );
    }
-   else {
+   else
       b_norm = lpNorm( b, 2.0 );
-      this->matrix->vectorProduct( x, r );
-      r = b - r;
-   }
+   if( b_norm == 0.0 )
+      b_norm = 1.0;
+
+   // r = M.solve(b - A * x);
+   compute_residue( r, x, b );
 
    p = r_ast = r;
    s.setValue( 0.0 );
-   rho = (r, r_ast);
+   r_ast_sqnorm = rho = (r, r_ast);
 
-   if( b_norm == 0.0 )
-       b_norm = 1.0;
+   const RealType eps2 = std::numeric_limits<RealType>::epsilon() * std::numeric_limits<RealType>::epsilon();
 
    this->resetIterations();
    this->setResidue( std::sqrt( rho ) / b_norm );
 
    while( this->nextIteration() )
    {
-      /****
-       * alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 )
-       */
-      if( this->preconditioner ) {
-         this->matrix->vectorProduct( p, M_tmp );
-         this->preconditioner->solve( M_tmp, Ap );
-      }
-      else {
-         this->matrix->vectorProduct( p, Ap );
-      }
-      aux = (Ap, r_ast);
-      alpha = rho / aux;
+      // alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 )
+      preconditioned_matvec( p, Ap );
+      alpha = rho / (Ap, r_ast);
 
-      /****
-       * s_j = r_j - alpha_j * A p_j
-       */
+      // s_j = r_j - alpha_j * A p_j
       s = r - alpha * Ap;
 
-      /****
-       * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
-       */
-      if( this->preconditioner ) {
-         this->matrix->vectorProduct( s, M_tmp );
-         this->preconditioner->solve( M_tmp, As );
-      }
-      else {
-         this->matrix->vectorProduct( s, As );
-      }
-      aux = lpNorm( As, 2.0 );
-      omega = (As, s) / (aux * aux);
+      // omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
+      preconditioned_matvec( s, As );
+      omega = (As, s) / (As, As);
 
-      /****
-       * x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
-       */
+      // x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
       x += alpha * p + omega * s;
 
-      /****
-       * r_{j+1} = s_j - omega_j * A s_j
-       */
+      // r_{j+1} = s_j - omega_j * A s_j
       r = s - omega * As;
 
-      /****
-       * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
-       */
+      // compute scalar product of the residual vectors
       rho_old = rho;
       rho = (r, r_ast);
+      if( abs(rho) < eps2 * r_ast_sqnorm ) {
+         // The new residual vector has become too orthogonal to the arbitrarily chosen direction r_ast.
+         // Let's restart with a new r0:
+         compute_residue( r, x, b );
+         r_ast = r;
+         r_ast_sqnorm = rho = (r, r_ast);
+      }
+
+      // beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
       beta = (rho / rho_old) * (alpha / omega);
 
-      /****
-       * p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j )
-       */
+      // p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j )
       p = r + beta * p - (beta * omega) * Ap;
 
       if( exact_residue ) {
-         /****
-          * Compute the exact preconditioned residue into the 's' vector.
-          */
-         if( this->preconditioner ) {
-            this->matrix->vectorProduct( x, M_tmp );
-            M_tmp = b - M_tmp;
-            this->preconditioner->solve( M_tmp, s );
-         }
-         else {
-            this->matrix->vectorProduct( x, s );
-            s = b - s;
-         }
+         // Compute the exact preconditioned residue into the 's' vector.
+         compute_residue( s, x, b );
          const RealType residue = lpNorm( s, 2.0 );
          this->setResidue( residue / b_norm );
       }
       else {
-         /****
-          * Use the "orthogonal residue vector" for stopping.
-          */
+         // Use the "orthogonal residue vector" for stopping.
          const RealType residue = lpNorm( r, 2.0 );
          this->setResidue( residue / b_norm );
       }
@@ -153,7 +122,40 @@ bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x )
 }
 
 template< typename Matrix >
-void BICGStab< Matrix > :: setSize( const VectorViewType& x )
+void
+BICGStab< Matrix >::
+compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b )
+{
+   if( ! this->preconditioner ) {
+      // plain residue, r = b - A * x, accumulated directly in the output vector
+      this->matrix->vectorProduct( x, r );
+      r = b - r;
+      return;
+   }
+   // preconditioned residue, r = M.solve(b - A * x); b - A * x is staged in M_tmp
+   this->matrix->vectorProduct( x, M_tmp );
+   M_tmp = b - M_tmp;
+   this->preconditioner->solve( M_tmp, r );
+}
+
+template< typename Matrix >
+void
+BICGStab< Matrix >::
+preconditioned_matvec( ConstVectorViewType src, VectorViewType dst )
+{
+   if( ! this->preconditioner ) {
+      this->matrix->vectorProduct( src, dst );
+      return;
+   }
+   // dst = M.solve(A * src); the raw product A * src is staged in M_tmp
+   this->matrix->vectorProduct( src, M_tmp );
+   this->preconditioner->solve( M_tmp, dst );
+}
+
+template< typename Matrix >
+void
+BICGStab< Matrix >::
+setSize( const VectorViewType& x )
 {
    r.setLike( x );
    r_ast.setLike( x );
diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index e1c02f0ab5eadbaea7ba2c2678641b0c1a05ee6b..818f1c163019a3f83347c2f8a0ca0ce1c518a667 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -23,10 +23,7 @@ class GMRES
 : public LinearSolver< Matrix >
 {
    using Base = LinearSolver< Matrix >;
-
-   // compatibility shortcuts
    using Traits = Linear::Traits< Matrix >;
-   using CommunicatorType = typename Traits::CommunicatorType;
 
 public:
    using RealType = typename Base::RealType;
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 02a122a5dd178cb7100edd52210004dccddf2626..3b13e0b28ed4c6f17bea1b9c582fbdcfd6edc961 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -477,20 +477,20 @@ hauseholder_generate( const int i,
                       ConstVectorViewType z )
 {
    // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization
+   ConstDeviceView z_local = Traits::getConstLocalView( z );
+   DeviceView y_i_local = Traits::getLocalView( y_i );
    if( localOffset == 0 ) {
       TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" );
       auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable
       {
          if( j < i )
-            y_i[ j ] = 0.0;
+            y_i_local[ j ] = 0.0;
          else
-            y_i[ j ] = z[ j ];
+            y_i_local[ j ] = z_local[ j ];
       };
       Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation );
    }
    else {
-      ConstDeviceView z_local = Traits::getConstLocalView( z );
-      DeviceView y_i_local = Traits::getLocalView( y_i );
       y_i_local = z_local;
    }
 
@@ -510,7 +510,7 @@ hauseholder_generate( const int i,
       norm_yi_squared = 2 * (normz * normz + std::fabs( y_ii ) * normz);
    }
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // XXX: normalization is slower, but more stable
 //   y_i *= 1.0 / std::sqrt( norm_yi_squared );
@@ -534,7 +534,7 @@ hauseholder_generate( const int i,
                  i,
                  aux );
       // no-op if the problem is not distributed
-      CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
+      MPI::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
       // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
       for( int k = 0; k < i; k++ ) {
@@ -559,7 +559,7 @@ hauseholder_apply_trunc( HostView out,
    HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 );
    Algorithms::MultiDeviceMemoryOperations< Devices::Host, DeviceType >::copy( YL_i.getData(), Traits::getLocalView( y_i ).getData(), YL_i.getSize() );
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // NOTE: aux = t_i * (y_i, z) = 1  since  t_i = 2 / ||y_i||^2  and
    //       (y_i, z) = ||z_trunc||^2 + |z_i| ||z_trunc|| = ||y_i||^2 / 2
@@ -579,7 +579,7 @@ hauseholder_apply_trunc( HostView out,
    }
 
    // no-op if the problem is not distributed
-   CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 }
 
 template< typename Matrix >
@@ -634,7 +634,7 @@ hauseholder_cwy_transposed( VectorViewType z,
               i + 1,
               aux );
    // no-op if the problem is not distributed
-   Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
+   MPI::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
index f88e315ccf734a12ec20e53fb930016aa0330b36..7c03dd7ce6bf073a1a675798eb19f677d8339520 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
@@ -42,12 +42,12 @@ protected:
    VectorType diagonal;
 };
 
-template< typename Matrix, typename Communicator >
-class Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >
-: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+class Diagonal< Matrices::DistributedMatrix< Matrix > >
+: public Preconditioner< Matrices::DistributedMatrix< Matrix > >
 {
 public:
-   using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >;
+   using MatrixType = Matrices::DistributedMatrix< Matrix >;
    using RealType = typename MatrixType::RealType;
    using DeviceType = typename MatrixType::DeviceType;
    using IndexType = typename MatrixType::IndexType;
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 788fc228d0d226db0e53e507c331859483ab7f69..17746373a338fc885d2ab46c9743448811fac857 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -49,57 +49,58 @@ void
 Diagonal< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
-   ConstVectorViewType diag_view( diagonal );
-
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      x[ i ] = b[ i ] / diag_view[ i ];
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   x = b / diagonal;
 }
 
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >  // update(): resizes `diagonal` to the number of local rows and fills it with the diagonal entries of the local matrix
 void
-Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >::
+Diagonal< Matrices::DistributedMatrix< Matrix > >::
 update( const MatrixPointer& matrixPointer )
 {
    TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" );
-   TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" );
-
    diagonal.setSize( matrixPointer->getLocalMatrix().getRows() );
 
    LocalViewType diag_view( diagonal );
-   const MatrixType* kernel_matrix = &matrixPointer.template getData< DeviceType >();
-
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i );
-      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi );
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   // FIXME: SparseMatrix::getConstView is broken, so the view returned by getView() is used instead; the kernels below capture it by value and only read from it
+//   const auto matrix_view = matrixPointer->getLocalMatrix().getConstView();
+   const auto matrix_view = matrixPointer->getLocalMatrix().getView();
+
+   if( matrixPointer->getRows() == matrixPointer->getColumns() ) {
+      // square matrix, assume global column indices: the diagonal entry of local row i is read from column row_range.getGlobalIndex( i )
+      const auto row_range = matrixPointer->getLocalRowRange();
+      auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+      {
+         const IndexType gi = row_range.getGlobalIndex( i );
+         diag_view[ i ] = matrix_view.getElement( i, gi );
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   }
+   else {
+      // non-square matrix, assume ghost indexing: the diagonal entry of local row i is read from local column i
+      TNL_ASSERT_LT( matrixPointer->getLocalMatrix().getRows(), matrixPointer->getLocalMatrix().getColumns(), "the local matrix should have more columns than rows" );
+      auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
+      {
+         diag_view[ i ] = matrix_view.getElement( i, i );
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   }
 }
 
-template< typename Matrix, typename Communicator >
+template< typename Matrix >  // solve(): applies the preconditioner by dividing the local part of b element-wise by the stored diagonal
 void
-Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >::
+Diagonal< Matrices::DistributedMatrix< Matrix > >::
 solve( ConstVectorViewType b, VectorViewType x ) const
 {
    ConstLocalViewType diag_view( diagonal );
    const auto b_view = b.getConstLocalView();
    auto x_view = x.getLocalView();
 
-   TNL_ASSERT_EQ( b_view.getSize(), diagonal.getSize(), "The size of the vector b does not match the size of the extracted diagonal." );
-   TNL_ASSERT_EQ( x_view.getSize(), diagonal.getSize(), "The size of the vector x does not match the size of the extracted diagonal." );
+   // compute without ghosts (diagonal includes only local rows): element-wise division on the local views
+   x_view = b_view / diag_view;
 
-   auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
-   {
-      x_view[ i ] = b_view[ i ] / diag_view[ i ];
-   };
-
-   Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
+   // synchronize ghosts: the synchronization of x is only started here; NOTE(review): waitForSynchronization() is not called in this function -- presumably the consumer of x waits, confirm
+   x.startSynchronization();
 }
 
 } // namespace Preconditioners
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index c4b409bb3a047e12410066bddbe9bdcce509ee89..a4eb9e8aae26786412fe8945a9ccf2795f6293fa 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -90,7 +90,12 @@ protected:
    template< typename M >
    static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m )
    {
-      return m.getLocalRowRange().getBegin();
+      if( m.getRows() == m.getColumns() )
+         // square matrix, assume global column indices: return the beginning of the local row range
+         return m.getLocalRowRange().getBegin();
+      else
+         // non-square matrix, assume ghost indexing: return 0 (NOTE(review): presumably the columns are numbered locally in this case -- confirm)
+         return 0;
    }
 };
 
@@ -189,11 +194,11 @@ protected:
 #endif
 };
 
-template< typename Matrix, typename Communicator >
-class ILU0_impl< Matrices::DistributedMatrix< Matrix, Communicator >, double, Devices::Cuda, int >
-: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+class ILU0_impl< Matrices::DistributedMatrix< Matrix >, double, Devices::Cuda, int >
+: public Preconditioner< Matrices::DistributedMatrix< Matrix > >
 {
-   using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >;
+   using MatrixType = Matrices::DistributedMatrix< Matrix >;
 public:
    using RealType = double;
    using DeviceType = Devices::Cuda;
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index c11909c073e9ec10b5310bebd2c4a20bbfbee5dc..f68a93f16c21c2a96ce1ed55132f021dc573b068 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -145,6 +145,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const
 
    // Step 2: solve x from Ux = y
    triangularSolveUpper< true, true >( U, x, x );
+
+   // synchronize ghosts
+   Traits< Matrix >::startSynchronization( _x );
 }
 
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index d46f3f900f4357dd7bf15dce170e5d63ecf22497..344daf1a0103a0a93ca576358b2da787d7578f8b 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -79,7 +79,12 @@ protected:
    template< typename M >
    static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m )
    {
-      return m.getLocalRowRange().getBegin();
+      if( m.getRows() == m.getColumns() )
+         // square matrix, assume global column indices: return the beginning of the local row range
+         return m.getLocalRowRange().getBegin();
+      else
+         // non-square matrix, assume ghost indexing: return 0 (NOTE(review): presumably the columns are numbered locally in this case -- confirm)
+         return 0;
    }
 };
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index c9c2a0b7768d1a0da4ba11460a2e0fa0c67eb068..21b895c48a2074b78b54d3eea11a301549f1afa6 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -272,6 +272,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const
 
    // Step 2: solve x from Ux = y
    triangularSolveUpper< true, false >( U, x, x );
+
+   // synchronize ghosts
+   Traits< Matrix >::startSynchronization( _x );
 }
 
 } // namespace Preconditioners
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 9a5db2c40297aeba9713482603fc8fcb15d6793e..d98b78294cd148584a51fa037a5763df8c9ebc3e 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -12,7 +12,7 @@
 
 #pragma once
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -26,8 +26,6 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
-   using CommunicatorType = Communicators::NoDistrCommunicator;
-
    using VectorType = Containers::Vector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
@@ -51,29 +49,26 @@ struct Traits
    static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v; }
    static LocalViewType getLocalView( VectorViewType v ) { return v; }
 
-   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; }
+   static MPI_Comm getCommunicationGroup( const Matrix& m ) { return MPI::AllGroup(); }
+   static void startSynchronization( VectorViewType v ) {}
+   static void waitForSynchronization( VectorViewType v ) {}
 };
 
-template< typename Matrix, typename Communicator >
-struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
+template< typename Matrix >
+struct Traits< Matrices::DistributedMatrix< Matrix > >
 {
-   using CommunicatorType = Communicator;
-
    using VectorType = Containers::DistributedVector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
    using VectorViewType = Containers::DistributedVectorView
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
    using ConstVectorViewType = Containers::DistributedVectorView
          < std::add_const_t< typename Matrix::RealType >,
            typename Matrix::DeviceType,
-           typename Matrix::IndexType,
-           Communicator >;
+           typename Matrix::IndexType >;
 
    using LocalVectorType = Containers::Vector
          < typename Matrix::RealType,
@@ -89,12 +84,13 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > >
            typename Matrix::IndexType >;
 
    // compatibility wrappers for some DistributedMatrix methods
-   static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix, Communicator >& m )
-   { return m.getLocalMatrix(); }
+   static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getLocalMatrix(); }
    static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); }
    static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); }
 
-   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
+   static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getCommunicationGroup(); }
+   static void startSynchronization( VectorViewType v ) { v.startSynchronization(); }
+   static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); }
 };
 
 } // namespace Linear
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 4c7b21bc93c5bcfb5adff76e89d876778f1049aa..247318f330d6154a419aa235f49cb74d22afdba5 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -13,14 +13,13 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Config/ParameterContainer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/MPI/Wrappers.h>
 
 #include "Merson.h"
 
 namespace TNL {
 namespace Solvers {
-namespace ODE {   
+namespace ODE {
 
 /****
  * In this code we do not use constants and references as we would like to.
@@ -155,9 +154,9 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u )
       RealType error( 0.0 );
       if( adaptivity != 0.0 )
       {
-         const RealType localError = 
+         const RealType localError =
             max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) );
-            Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+            MPI::Allreduce( &localError, &error, 1, MPI_MAX, MPI::AllGroup() );
       }
 
       if( adaptivity == 0.0 || error < adaptivity )
@@ -186,7 +185,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u )
          currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
          TNLMPI::Bcast( currentTau, 1, 0 );
-#endif        
+#endif
       }
       if( time + currentTau > this->getStopTime() )
          currentTau = this->getStopTime() - time; //we don't want to keep such tau
diff --git a/src/TNL/Solvers/PDE/PDESolver.h b/src/TNL/Solvers/PDE/PDESolver.h
index b9bbcd5e2e3ba89d0a424611ea3547008f5f9632..70f19d8de9e58fcb8ae3c141eeecd7ea95e3e862 100644
--- a/src/TNL/Solvers/PDE/PDESolver.h
+++ b/src/TNL/Solvers/PDE/PDESolver.h
@@ -18,8 +18,8 @@
 
 namespace TNL {
 namespace Solvers {
-namespace PDE { 
-   
+namespace PDE {
+
 template< typename Real,
           typename Index >
 class PDESolver
@@ -28,8 +28,8 @@ class PDESolver
       using RealType = Real;
       using IndexType = Index;
       using SolverMonitorType = IterativeSolverMonitor< RealType, IndexType >;
-      
-      
+
+
       PDESolver();
 
       static void configSetup( Config::ConfigDescription& config,
@@ -38,29 +38,28 @@ class PDESolver
       bool setup( const Config::ParameterContainer& parameters,
                   const String& prefix = "" );
 
-      template< typename Communicator >
       bool writeProlog( Logger& logger,
                         const Config::ParameterContainer& parameters );
-      
+
       void setIoTimer( Timer& ioTimer);
 
       void setComputeTimer( Timer& computeTimer );
-      
+
       void setTotalTimer( Timer& totalTimer );
-      
+
       void setSolverMonitor( SolverMonitorType& solverMonitor );
-      
+
       SolverMonitorType& getSolverMonitor();
 
-      bool writeEpilog( Logger& logger ) const;      
-      
+      bool writeEpilog( Logger& logger ) const;
+
    protected:
 
       Timer *ioTimer, *computeTimer, *totalTimer;
-      
+
       SolverMonitorType *solverMonitorPointer;
 };
- 
+
 } // namespace PDE
 } // namespace Solvers
 } // namespace TNL
diff --git a/src/TNL/Solvers/PDE/PDESolver_impl.h b/src/TNL/Solvers/PDE/PDESolver_impl.h
index 37ade9f38e74427903b1c624a383544179839bed..8bdcbd86ab3905dc08227370580f034d53abc612 100644
--- a/src/TNL/Solvers/PDE/PDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/PDESolver_impl.h
@@ -11,21 +11,22 @@
 #pragma once
 
 #include <TNL/Solvers/PDE/PDESolver.h>
+#include <TNL/MPI/Utils.h>
 
 namespace TNL {
 namespace Solvers {
-namespace PDE { 
+namespace PDE {
 
 template< typename Real,
-          typename Index >   
-PDESolver< Real, Index >::PDESolver()   
+          typename Index >
+PDESolver< Real, Index >::PDESolver()
 : ioTimer( 0 ),
   computeTimer( 0 ),
   totalTimer( 0 ),
   solverMonitorPointer( 0 )
 {
 }
-   
+
 template< typename Real,
           typename Index >
 void
@@ -65,7 +66,6 @@ getSolverMonitor()
 
 template< typename Real,
           typename Index >
-   template< typename Communicator >
 bool
 PDESolver< Real, Index >::
 writeProlog( Logger& logger,
@@ -84,7 +84,8 @@ writeProlog( Logger& logger,
       else
          logger.writeParameter< String >( "OMP enabled:", "no", 1 );
    }
-   Communicator::writeProlog( logger );
+   if( MPI::isInitialized() )
+      logger.writeParameter( "MPI processes:", MPI::GetSize() );
    logger.writeSeparator();
    const bool printGPUs = parameters.getParameter< String >( "device" ) == "cuda";
    logger.writeSystemInformation( printGPUs );
@@ -116,9 +117,9 @@ void PDESolver< Real, Index >::
 setTotalTimer( Timer& totalTimer )
 {
    this->totalTimer = &totalTimer;
-}  
-   
+}
+
 } // namespace PDE
 } // namespace Solvers
 } // namespace TNL
-   
+
diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
index 46ffa6fea83ac96ab8e987da149516ac0f6f7213..34f2798f8d6ff70196ed0c7e375eab63f371eb19 100644
--- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h
@@ -63,7 +63,7 @@ setup( const Config::ParameterContainer& parameters,
    const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" );
    this->distributedMesh.setup( parameters, prefix );
    if( Problem::CommunicatorType::isDistributed() ) {
-      if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
+      if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
          return false;
       if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) )
          return false;
@@ -165,7 +165,7 @@ writeProlog( Logger& logger,
    logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters );
    logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters );
    logger.writeSeparator();
-   return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters );
+   return BaseType::writeProlog( logger, parameters );
 }
 
 template< typename Problem,
diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
index 455682e2b6498f13061f8a86c005c2402c44c7da..880d0ab31de1fcf6557ab40a5d2bbb1fbe3f0cd3 100644
--- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h
@@ -15,7 +15,7 @@
  *                                                                         *
  ***************************************************************************/
 
-#pragma once 
+#pragma once
 
 #include <TNL/Solvers/PDE/TimeIndependentPDESolver.h>
 #include <TNL/Meshes/TypeResolver/TypeResolver.h>
@@ -23,7 +23,7 @@
 
 namespace TNL {
 namespace Solvers {
-namespace PDE {   
+namespace PDE {
 
 
 template< typename Problem >
@@ -54,7 +54,7 @@ setup( const Config::ParameterContainer& parameters,
    const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" );
    this->distributedMesh.setup( parameters, prefix );
    if( Problem::CommunicatorType::isDistributed() ) {
-      if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
+      if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) )
          return false;
       if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) )
          return false;
@@ -75,7 +75,7 @@ setup( const Config::ParameterContainer& parameters,
       return false;
    }
    problem->setCommonData( this->commonDataPointer );
-   
+
    /****
     * Setup the problem
     */
@@ -83,7 +83,7 @@ setup( const Config::ParameterContainer& parameters,
    {
       std::cerr << "The problem initiation failed!" << std::endl;
       return false;
-   }   
+   }
 
    /****
     * Set DOFs (degrees of freedom)
@@ -91,9 +91,9 @@ setup( const Config::ParameterContainer& parameters,
    TNL_ASSERT_GT( problem->getDofs(), 0, "number of DOFs must be positive" );
    this->dofs->setSize( problem->getDofs() );
    this->dofs->setValue( 0.0 );
-   this->problem->bindDofs( this->dofs );   
-   
-   
+   this->problem->bindDofs( this->dofs );
+
+
    /***
     * Set-up the initial condition
     */
@@ -102,7 +102,7 @@ setup( const Config::ParameterContainer& parameters,
    if( ! this->problem->setInitialCondition( parameters, this->dofs ) )
       return false;
    std::cout << " [ OK ]" << std::endl;
-   
+
    return true;
 }
 
@@ -128,7 +128,7 @@ writeProlog( Logger& logger,
    logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters );
    logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters );
    logger.writeSeparator();
-   return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters );
+   return BaseType::writeProlog( logger, parameters );
 }
 
 template< typename Problem >
diff --git a/src/TNL/Solvers/SolverInitiator.h b/src/TNL/Solvers/SolverInitiator.h
index 0ba4dc55a9bbad7354dfe296cecb2c10074da3ff..06285752054fdd675e449e68cbc2506ac48a6565 100644
--- a/src/TNL/Solvers/SolverInitiator.h
+++ b/src/TNL/Solvers/SolverInitiator.h
@@ -16,7 +16,7 @@
 namespace TNL {
 namespace Solvers {
 
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter , typename CommunicatorType  > class ProblemSetter,
+template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
           typename ConfigTag >
 class SolverInitiator
 {
diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h
index 16e0fd2227830a5687e8ee9a04a878d723efd8d3..3d704426dd2715d18355b6a1329c6333efaedf73 100644
--- a/src/TNL/Solvers/SolverInitiator_impl.h
+++ b/src/TNL/Solvers/SolverInitiator_impl.h
@@ -18,7 +18,6 @@
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Meshes/DummyMesh.h>
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
 namespace TNL {
@@ -50,15 +49,6 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Device,
           typename Index,
           typename ConfigTag,
-          bool enabled = true  >
-class CommunicatorTypeResolver {};
-
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename ConfigTag,
-          typename CommunicatorType,
           bool enabled = ConfigTagMeshResolve< ConfigTag >::enabled >
 class SolverInitiatorMeshResolver {};
 
@@ -169,7 +159,7 @@ class SolverInitiatorIndexResolver< ProblemSetter, Real, Device, Index, ConfigTa
    public:
       static bool run( const Config::ParameterContainer& parameters )
       {
-         return CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::run( parameters );
+         return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag >::run( parameters );  // the omitted last template argument defaults to ConfigTagMeshResolve< ConfigTag >::enabled
       }
 };
 
@@ -178,28 +168,12 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Device,
           typename Index,
           typename ConfigTag >
-class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
-{
-   public:
-      static bool run( const Config::ParameterContainer& parameters )
-      {
-         if( Communicators::MpiCommunicator::isDistributed() )
-            return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters );
-         return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters );
-      }
-};
-
-template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
-          typename Real,
-          typename Device,
-          typename Index,
-          typename ConfigTag,
-          typename CommunicatorType >
-class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, CommunicatorType, false >
+class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, false >
 {
    public:
       static bool run( const Config::ParameterContainer& parameters )
       {
+         using CommunicatorType = Communicators::MpiCommunicator;
          return ProblemSetter< Real,
                                Device,
                                Index,
@@ -213,10 +187,11 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Real,
           typename Device,
           typename Index,
-          typename ConfigTag,
-          typename CommunicatorType >
-class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag,CommunicatorType, true >
+          typename ConfigTag >
+class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
 {
+   using CommunicatorType = Communicators::MpiCommunicator;
+
    // wrapper for MeshTypeResolver
    template< typename MeshType >
    using ProblemSetterWrapper = ProblemSetter< Real, Device, Index, MeshType, ConfigTag, SolverStarter< ConfigTag >, CommunicatorType >;
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index d2bbd81594658ca60b74426e6b4bce8d0b68f74c..dbecdaad98dc62f9a633a0f629ec9b246ce00df8 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -14,8 +14,7 @@
 #include <TNL/String.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Devices/Host.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/MPI/Config.h>
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Solvers/BuildConfigTags.h>
 #include <TNL/Solvers/ODE/Merson.h>
@@ -25,14 +24,14 @@
 #include <TNL/Solvers/PDE/PDESolverTypeResolver.h>
 
 namespace TNL {
-namespace Solvers {   
+namespace Solvers {
 
 template< typename Problem,
           typename ConfigTag,
           bool TimeDependent = Problem::isTimeDependent() >
 class TimeDependencyResolver
 {};
-   
+
 template< typename Problem,
           typename ConfigTag,
           typename TimeStepper = typename Problem::TimeStepper >
@@ -66,8 +65,7 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame
     */
    if( ! Devices::Host::setup( parameters ) ||
        ! Devices::Cuda::setup( parameters ) ||
-       ! Communicators::NoDistrCommunicator::setup( parameters ) ||
-       ! Communicators::MpiCommunicator::setup( parameters ) 
+       ! MPI::setup( parameters )
     )
       return false;
    Problem problem;
@@ -95,7 +93,7 @@ class TimeDependencyResolver< Problem, ConfigTag, false >
                        const Config::ParameterContainer& parameters )
       {
          // TODO: This should be improved - at least rename to LinearSolverSetter
-         return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters );   
+         return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters );
       }
 };
 
@@ -338,7 +336,7 @@ bool SolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
     */
    this->computeTimer.reset();
    this->ioTimer.reset();
-   
+
    /****
     * Create solver monitor thread
     */
diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h
index 9182c620fe2ed589389ab5a64328917f392795c7..bc1f43c7779e4af5d48063f3e3794fc3e4a1cd06 100644
--- a/src/TNL/Solvers/Solver_impl.h
+++ b/src/TNL/Solvers/Solver_impl.h
@@ -15,12 +15,12 @@
 #include <TNL/Solvers/SolverConfig.h>
 #include <TNL/Config/parseCommandLine.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 namespace TNL {
 namespace Solvers {
-   
+
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename MeshConfig, typename SolverStarter, typename CommunicatorType > class ProblemSetter,
           template< typename MeshConfig > class ProblemConfig,
           typename MeshConfig >
@@ -35,9 +35,9 @@ run( int argc, char* argv[] )
    configDescription.addDelimiter( "Parallelization setup:" );
    Devices::Host::configSetup( configDescription );
    Devices::Cuda::configSetup( configDescription );
-   Communicators::MpiCommunicator::configSetup( configDescription );
+   MPI::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv );
+   TNL::MPI::ScopedInitializer mpi( argc, argv );
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return false;
diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h
index 2afda7aad4522e6a62eb42838c8a23deca9d6b4b..63b8fc27391ebf9682c4a6a0e7a022a81caa2599 100644
--- a/src/TNL/TypeTraits.h
+++ b/src/TNL/TypeTraits.h
@@ -253,4 +253,21 @@ public:
     static constexpr bool value = type::value;
 };
 
+/**
+ * \brief Type trait for checking if T has getCommunicationGroup method: `value` is true when calling `getCommunicationGroup()` without arguments on a `std::decay_t< T >` object is well-formed and the literal 0 converts to its result type.
+ */
+template< typename T >
+class HasGetCommunicationGroupMethod
+{
+private:
+    typedef char YesType[1];  // the two result types differ in size,
+    typedef char NoType[2];   // so sizeof can tell which overload of test was selected
+
+    template< typename C > static YesType& test( decltype(std::declval< C >().getCommunicationGroup()) );  // viable only if the member call expression compiles (SFINAE)
+    template< typename C > static NoType& test(...);  // fallback: the ellipsis loses overload resolution to any other viable overload
+
+public:
+    static constexpr bool value = ( sizeof( test< std::decay_t<T> >(0) ) == sizeof( YesType ) );  // test is never defined, only used in an unevaluated context
+};
+
 } //namespace TNL
diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp
index c33ae829439885aa0e69ed2efde610796d4d0ed2..7003489ab287dd733fa71761c0bc44302eb478d3 100644
--- a/src/Tools/tnl-game-of-life.cpp
+++ b/src/Tools/tnl-game-of-life.cpp
@@ -17,13 +17,11 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 using namespace TNL;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 struct MyConfigTag {};
 
 namespace TNL {
@@ -198,8 +196,8 @@ bool runGameOfLife( const Mesh& mesh )
       }
    }
    Index max_count;
-   CommunicatorType::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() );
-   std::cout << "Rank " << CommunicatorType::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl;
+   TNL::MPI::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() );
+   std::cout << "Rank " << TNL::MPI::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl;
    Index reference_cell = 0;
    if( count == max_count ) {
       // find cell which has all points in the central box
@@ -256,7 +254,7 @@ bool runGameOfLife( const Mesh& mesh )
       // create a .pvtu file (only rank 0 actually writes to the file)
       const std::string mainFilePath = "GoL." + std::to_string(iteration) + ".pvtu";
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -266,7 +264,7 @@ bool runGameOfLife( const Mesh& mesh )
       if( mesh.getGhostLevels() > 0 )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
       pvtu.template writePCellData< Real >( "function values" );
-      const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -292,7 +290,7 @@ bool runGameOfLife( const Mesh& mesh )
    Index iteration = 0;
    do {
       iteration++;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          std::cout << "Computing iteration " << iteration << "..." << std::endl;
 
       // iterate over all local entities
@@ -338,7 +336,7 @@ bool runGameOfLife( const Mesh& mesh )
 
       // check if finished
       const bool done = max( f_in.getData() ) == 0 || iteration > max_iter || f_in.getData() == f_out.getData();
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
    }
    while( all_done == false );
 
@@ -351,7 +349,7 @@ void configSetup( Config::ConfigDescription& config )
    config.addRequiredEntry< String >( "input-file", "Input file with the mesh." );
    config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" );
    config.addDelimiter( "MPI settings:" );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int main( int argc, char* argv[] )
@@ -361,12 +359,12 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
-   if( ! CommunicatorType::setup( parameters ) )
+   if( ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String inputFileName = parameters.getParameter< String >( "input-file" );
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index 1a7769b5c89911018127c0fb4e105f3931f6a7a1..a1b3a8ff33c259562bb23d779553d23b11368a1a 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -15,8 +15,8 @@
 #include <TNL/Functions/TestFunction.h>
 #include <TNL/Meshes/Grid.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 
 using namespace TNL;
@@ -53,9 +53,9 @@ int main( int argc, char* argv[] )
    Config::ConfigDescription configDescription;
 
    setupConfig( configDescription );
-   Communicators::MpiCommunicator::configSetup( configDescription );
+   TNL::MPI::configSetup( configDescription );
 
-   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return EXIT_FAILURE;
diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h
index a0d171f14b4f102a56d862fb283d3bb4437e680a..e78db1153b9bb52001bba79c182f1401f54307b8 100644
--- a/src/Tools/tnl-init.h
+++ b/src/Tools/tnl-init.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/MPI/Wrappers.h>
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Functions/TestFunction.h>
@@ -21,37 +22,32 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-
 using namespace TNL;
 
 template< typename MeshType,
           typename RealType,
-          typename CommunicatorType,
           int xDiff,
           int yDiff,
           int zDiff >
 bool renderFunction( const Config::ParameterContainer& parameters )
 {
-
    using namespace  Meshes::DistributedMeshes;
    using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh<MeshType>;
    DistributedGridType distributedMesh;
    Pointers::SharedPointer< MeshType > meshPointer;
    MeshType globalMesh;
 
-   if(CommunicatorType::isDistributed())
+   if(TNL::MPI::GetSize() > 1)
    {
        //suppose global mesh loaded from single file
        String meshFile = parameters.getParameter< String >( "mesh" );
        std::cout << "+ -> Loading mesh from " << meshFile << " ... " << std::endl;
        globalMesh.load( meshFile );
-   
+
        // TODO: This should work with no overlaps
-       distributedMesh.template setGlobalGrid<CommunicatorType>(globalMesh);
+       distributedMesh.setGlobalGrid(globalMesh);
        typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-       SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 );
+       SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 );
        distributedMesh.setOverlaps( lowerOverlap, upperOverlap );
        distributedMesh.setupGrid(*meshPointer);
     }
@@ -74,7 +70,7 @@ bool renderFunction( const Config::ParameterContainer& parameters )
    MeshFunctionPointer meshFunction( meshPointer );
    //if( ! discreteFunction.setSize( mesh.template getEntitiesCount< typename MeshType::Cell >() ) )
    //   return false;
- 
+
    double finalTime = parameters.getParameter< double >( "final-time" );
    double initialTime = parameters.getParameter< double >( "initial-time" );
    double tau = parameters.getParameter< double >( "snapshot-period" );
@@ -116,7 +112,7 @@ bool renderFunction( const Config::ParameterContainer& parameters )
       else
         std::cout << "+ -> Writing the function to " << outputFile << " ... " << std::endl;
 
-      if(CommunicatorType::isDistributed())
+      if(TNL::MPI::GetSize() > 1)
       {
          if( ! Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType> ::save(outputFile, *meshFunction ) )
             return false;
@@ -130,20 +126,6 @@ bool renderFunction( const Config::ParameterContainer& parameters )
    return true;
 }
 
-template< typename MeshType,
-          typename RealType,
-          int xDiff,
-          int yDiff,
-          int zDiff >
-bool resolveCommunicator( const Config::ParameterContainer& parameters )
-{
-#ifdef HAVE_MPI
-   if( Communicators::MpiCommunicator::isDistributed() )
-      return renderFunction<MeshType,RealType, Communicators::MpiCommunicator,xDiff,yDiff,zDiff>(parameters);
-#endif
-   return renderFunction<MeshType,RealType, Communicators::NoDistrCommunicator,xDiff,yDiff,zDiff>(parameters);
-}
-
 template< typename MeshType,
           typename RealType >
 bool resolveDerivatives( const Config::ParameterContainer& parameters )
@@ -160,75 +142,75 @@ bool resolveDerivatives( const Config::ParameterContainer& parameters )
       return false;
    }
    if( xDiff == 0 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 0 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 1 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 2 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 3 >( parameters );
    if( xDiff == 0 && yDiff == 0 && zDiff == 4 )
-      return resolveCommunicator< MeshType, RealType, 0, 0, 4 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 0, 4 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 0 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 1 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 2 >( parameters );
    if( xDiff == 0 && yDiff == 1 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 0, 1, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 1, 3 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 0 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 1 >( parameters );
    if( xDiff == 0 && yDiff == 2 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 0, 2, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 2, 2 >( parameters );
    if( xDiff == 0 && yDiff == 3 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 3, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 3, 0 >( parameters );
    if( xDiff == 0 && yDiff == 3 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 0, 3, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 3, 1 >( parameters );
    if( xDiff == 0 && yDiff == 4 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 0, 4, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 0, 4, 0 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 0 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 1 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 2 >( parameters );
    if( xDiff == 1 && yDiff == 0 && zDiff == 3 )
-      return resolveCommunicator< MeshType, RealType, 1, 0, 3 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 0, 3 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 0 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 1 >( parameters );
    if( xDiff == 1 && yDiff == 1 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 1, 1, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 1, 2 >( parameters );
    if( xDiff == 1 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 2, 0 >( parameters );
    if( xDiff == 1 && yDiff == 2 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 1, 2, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 2, 1 >( parameters );
    if( xDiff == 1 && yDiff == 3 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 1, 3, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 1, 3, 0 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 0 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 1 >( parameters );
    if( xDiff == 2 && yDiff == 0 && zDiff == 2 )
-      return resolveCommunicator< MeshType, RealType, 2, 0, 2 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 0, 2 >( parameters );
    if( xDiff == 2 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 1, 0 >( parameters );
    if( xDiff == 2 && yDiff == 1 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 2, 1, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 1, 1 >( parameters );
    if( xDiff == 2 && yDiff == 2 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 2, 2, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 2, 2, 0 >( parameters );
    if( xDiff == 3 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 3, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 0, 0 >( parameters );
    if( xDiff == 3 && yDiff == 0 && zDiff == 1 )
-      return resolveCommunicator< MeshType, RealType, 3, 0, 1 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 0, 1 >( parameters );
    if( xDiff == 3 && yDiff == 1 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 3, 1, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 3, 1, 0 >( parameters );
    if( xDiff == 4 && yDiff == 0 && zDiff == 0 )
-      return resolveCommunicator< MeshType, RealType, 4, 0, 0 >( parameters );
+      return renderFunction< MeshType, RealType, 4, 0, 0 >( parameters );
    return false;
 }
 
diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h
index 0be53242b8122f4173a11d27181a73232067b125..6b748d99355375e0f0bb4ac20eaf54129a07ad2f 100644
--- a/src/Tools/tnl-test-distributed-mesh.h
+++ b/src/Tools/tnl-test-distributed-mesh.h
@@ -18,13 +18,11 @@
 #include <TNL/Meshes/Geometry/getEntityCenter.h>
 #include <TNL/Meshes/Writers/VTUWriter.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Config.h>
 
 using namespace TNL;
 
-using CommunicatorType = Communicators::MpiCommunicator;
-
 struct MyConfigTag {};
 
 namespace TNL {
@@ -214,7 +212,7 @@ void testSynchronizerOnDevice( const MeshType& mesh )
          if( received != center ) {
             IndexType cellIndexes[ 2 ] = {0, 0};
             const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes );
-            std::cerr << "rank " << CommunicatorType::GetRank()
+            std::cerr << "rank " << TNL::MPI::GetRank()
                       << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")"
                       << " of dimension = " << EntityType::getEntityDimension()
                       << ": received " << received << ", expected = " << center
@@ -224,7 +222,7 @@ void testSynchronizerOnDevice( const MeshType& mesh )
          }
       }
    if( errors > 0 ) {
-      std::cerr << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl;
+      std::cerr << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." << std::endl;
       TNL_ASSERT_TRUE( false, "test failed" );
    }
 }
@@ -273,7 +271,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
       // create a .pvtu file (only rank 0 actually writes to the file)
       const std::string mainFilePath = "data_" + std::to_string(iteration) + ".pvtu";
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -284,7 +282,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
       pvtu.template writePCellData< Real >( "function values" );
       pvtu.template writePCellData< Real >( "test values" );
-      const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -315,7 +313,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
    int iteration = 0;
    do {
       iteration++;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          std::cout << "Computing iteration " << iteration << "..." << std::endl;
 
       const Index prev_sum = sum( f_K.getData() );
@@ -400,14 +398,14 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations )
          std::cerr << "ERROR: propatation over faces differs from the propagation over neighbor cells. Differing values are:\n";
          for( Index K = 0; K < f_K_view.getSize(); K++ )
             if( f_K_view[ K ] != f_K_test_view[ K ] )
-               std::cerr << "   rank = " << CommunicatorType::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n";
+               std::cerr << "   rank = " << TNL::MPI::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n";
          std::cerr.flush();
          TNL_ASSERT_TRUE( false, "test failed" );
       }
 
       // check if finished
       const bool done = sum( f_K.getData() ) == prev_sum || iteration > max_iterations;
-      CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
+      TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() );
    }
    while( all_done == false );
 
@@ -421,7 +419,7 @@ void configSetup( Config::ConfigDescription& config )
    config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" );
    config.addEntry< int >( "max-iterations", "Maximum number of iterations to compute", 100 );
    config.addDelimiter( "MPI settings:" );
-   CommunicatorType::configSetup( config );
+   TNL::MPI::configSetup( config );
 }
 
 int main( int argc, char* argv[] )
@@ -431,12 +429,12 @@ int main( int argc, char* argv[] )
 
    configSetup( conf_desc );
 
-   Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv);
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
 
    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
       return EXIT_FAILURE;
 
-   if( ! CommunicatorType::setup( parameters ) )
+   if( ! TNL::MPI::setup( parameters ) )
       return EXIT_FAILURE;
 
    const String inputFileName = parameters.getParameter< String >( "input-file" );
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index 8e4ac724954bcf92e169ae3bc67f11a533d37c04..2c0ba865069600a497932227759c3de5d9a3e9f6 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -1,4 +1,3 @@
-ADD_SUBDIRECTORY( Communicators )
 ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Functions )
 # Matrices are included from src/CMakeLists.txt
diff --git a/src/UnitTests/Communicators/CMakeLists.txt b/src/UnitTests/Communicators/CMakeLists.txt
deleted file mode 100644
index 1a3331c3a50990b2f0e0f3fa29e5b72891fe434a..0000000000000000000000000000000000000000
--- a/src/UnitTests/Communicators/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-if( ${BUILD_MPI} )
-   ADD_EXECUTABLE( MpiCommunicatorTest MpiCommunicatorTest.cpp )
-   TARGET_COMPILE_OPTIONS( MpiCommunicatorTest PRIVATE ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( MpiCommunicatorTest ${GTEST_BOTH_LIBRARIES} )
-
-   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MpiCommunicatorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-   ADD_TEST( NAME MpiCommunicatorTest COMMAND "mpirun" ${mpi_test_parameters})
-
-endif()
diff --git a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp b/src/UnitTests/Communicators/MpiCommunicatorTest.cpp
deleted file mode 100644
index b78011953e638a84796d5b30daa848f168274303..0000000000000000000000000000000000000000
--- a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/***************************************************************************
-                          MpiCommunicatorTest.h  -  description
-                             -------------------
-    begin                : Jul 10, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifdef HAVE_GTEST
-
-#include "gtest/gtest.h"
-#include <TNL/Communicators/MpiCommunicator.h>
-
-using namespace TNL;
-using namespace TNL::Communicators;
-
-// test fixture for typed tests
-template< typename Real >
-class MpiCommunicatorTest : public ::testing::Test
-{
-   protected:
-      using RealType = Real;
-      using CommunicatorType = MpiCommunicator;
-};
-
-// types for which MpiCommunicatorTest is instantiated
-using MpiCommunicatorTypes = ::testing::Types<
-   short,
-   int,
-   long,
-   float,
-   double
->;
-
-TYPED_TEST_SUITE( MpiCommunicatorTest, MpiCommunicatorTypes );
-
-TYPED_TEST( MpiCommunicatorTest, allReduce )
-{
-   using RealType = typename TestFixture::RealType;
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-   RealType a = CommunicatorType::GetRank();
-   RealType b = 0;
-   CommunicatorType::Allreduce( &a, &b, 1, MPI_MAX, MPI_COMM_WORLD );
-   EXPECT_EQ( b, CommunicatorType::GetSize() - 1  );
-}
-
-#endif // HAVE_GTEST
-
-#include "../main_mpi.h"
\ No newline at end of file
diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt
index fdde0a8b723b971d26304d129c9c3a64c3a3a862..efba5e50de2756a9f9335ecf18094e99689bad08 100644
--- a/src/UnitTests/Containers/CMakeLists.txt
+++ b/src/UnitTests/Containers/CMakeLists.txt
@@ -92,30 +92,39 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorUnaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorUnaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorVerticalOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    if( BUILD_CUDA )
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
 
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
    endif()
 endif()
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index 204bc6fe753c9f75b55bd3523eb1f708faf0b857..e25739afe1b8fb6608e5d319953f4d072f77f875 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -9,13 +9,14 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Containers/DistributedArray.h>
 #include <TNL/Containers/Partitioner.h>
 
+#include "VectorHelperFunctions.h"
+
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::MPI;
 
 /*
  * Light check of DistributedArray.
@@ -31,7 +32,6 @@ class DistributedArrayTest
 protected:
    using ValueType = typename DistributedArray::ValueType;
    using DeviceType = typename DistributedArray::DeviceType;
-   using CommunicatorType = typename DistributedArray::CommunicatorType;
    using IndexType = typename DistributedArray::IndexType;
    using DistributedArrayType = DistributedArray;
    using ArrayViewType = typename DistributedArrayType::LocalViewType;
@@ -39,44 +39,55 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
    DistributedArrayType distributedArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
+
+   // some arbitrary even value (but must be 0 if not distributed)
+   const int ghosts = (nproc > 1) ? 4 : 0;
 
    DistributedArrayTest()
    {
       using LocalRangeType = typename DistributedArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
-      distributedArray.setDistribution( localRange, globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
+      distributedArray.setDistribution( localRange, ghosts, globalSize, group );
+
+      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
+      distributedArray.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
 
       EXPECT_EQ( distributedArray.getLocalRange(), localRange );
+      EXPECT_EQ( distributedArray.getGhosts(), ghosts );
       EXPECT_EQ( distributedArray.getCommunicationGroup(), group );
    }
 };
 
 // types for which DistributedArrayTest is instantiated
 using DistributedArrayTypes = ::testing::Types<
-   DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >,
-   DistributedArray< double, Devices::Host, int, Communicators::NoDistrCommunicator >
+   DistributedArray< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
-   DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-   DistributedArray< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+   DistributedArray< double, Devices::Cuda, int >
 #endif
 >;
 
 TYPED_TEST_SUITE( DistributedArrayTest, DistributedArrayTypes );
 
-TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes )
+TYPED_TEST( DistributedArrayTest, checkLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
+   EXPECT_EQ( this->distributedArray.getLocalView().getSize(), this->distributedArray.getLocalRange().getSize() );
+   EXPECT_EQ( this->distributedArray.getConstLocalView().getSize(), this->distributedArray.getLocalRange().getSize() );
+   EXPECT_EQ( this->distributedArray.getLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts );
+   EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts );
+}
 
+TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes )
+{
    const int localSize = this->distributedArray.getLocalView().getSize();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedArray.getSize(), this->globalSize );
 }
@@ -88,14 +99,26 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal )
 
    this->distributedArray.setValue( 0.0 );
    ArrayType globalArray( this->globalSize );
-   globalArray.setValue( 1.0 );
+   setLinearSequence( globalArray );
    this->distributedArray.copyFromGlobal( globalArray );
+   this->distributedArray.waitForSynchronization();
 
-   ArrayViewType localArrayView = this->distributedArray.getLocalView();
-   auto globalView = globalArray.getConstView();
    const auto localRange = this->distributedArray.getLocalRange();
-   globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getEnd() - localRange.getBegin() );
+   ArrayViewType localArrayView;
+   localArrayView.bind( this->distributedArray.getLocalView().getData(), localRange.getSize() );
+   auto globalView = globalArray.getConstView();
+   globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getSize() );
    EXPECT_EQ( localArrayView, globalView );
+
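+   // check ghost values: the first ghosts/2 slots mirror the end of the left neighbor's range, the rest the start of the right neighbor's (periodic)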
+   for( int o = 0; o < this->ghosts / 2; o++ ) {
+      const int left_i = localRange.getSize() + o;
+      const int left_gi = ((this->rank > 0) ? localRange.getBegin() : this->globalSize) - this->ghosts / 2 + o;
+      EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( left_i ), globalArray.getElement( left_gi ) );
+      const int right_i = localRange.getSize() + this->ghosts / 2 + o;
+      const int right_gi = ((this->rank < this->nproc - 1) ? localRange.getEnd() : 0) + o;
+      EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( right_i ), globalArray.getElement( right_gi ) );
+   }
 }
 
 TYPED_TEST( DistributedArrayTest, setLike )
@@ -126,23 +149,47 @@ TYPED_TEST( DistributedArrayTest, setValue )
    using ArrayType = typename TestFixture::ArrayType;
 
    this->distributedArray.setValue( 1.0 );
+   this->distributedArray.waitForSynchronization();
    ArrayViewType localArrayView = this->distributedArray.getLocalView();
    ArrayType expected( localArrayView.getSize() );
    expected.setValue( 1.0 );
    EXPECT_EQ( localArrayView, expected );
 }
 
+TYPED_TEST( DistributedArrayTest, setValueGhosts )
+{
+   using ArrayViewType = typename TestFixture::ArrayViewType;
+   using ArrayType = typename TestFixture::ArrayType;
+
+   this->distributedArray.setValue( this->rank );
+   this->distributedArray.waitForSynchronization();
+   ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts();
+   ArrayType expected( localArrayView.getSize() );
+   expected.setValue( this->rank );
+
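+   // set expected ghost values: first ghosts/2 slots hold the left neighbor's rank, the next ghosts/2 the right neighbor's (periodic wrap-around)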
+   const int left = (this->rank > 0) ? this->rank - 1 : this->nproc - 1;
+   const int right = (this->rank < this->nproc - 1) ? this->rank + 1 : 0;
+   for( int o = 0; o < this->ghosts / 2; o++ ) {
+      expected.setElement( this->distributedArray.getLocalRange().getSize() + o, left );
+      expected.setElement( this->distributedArray.getLocalRange().getSize() + this->ghosts / 2 + o, right );
+   }
+
+   EXPECT_EQ( localArrayView, expected );
+}
+
 TYPED_TEST( DistributedArrayTest, elementwiseAccess )
 {
    using ArrayViewType = typename TestFixture::ArrayViewType;
    using IndexType = typename TestFixture::IndexType;
 
    this->distributedArray.setValue( 0 );
+   this->distributedArray.waitForSynchronization();
    ArrayViewType localArrayView = this->distributedArray.getLocalView();
    const auto localRange = this->distributedArray.getLocalRange();
 
    // check initial value
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), 0 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), 0 );
@@ -152,13 +199,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    }
 
    // use setValue
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i + 1 );
    }
 
    // check set value
-   for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+   for( IndexType i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), i + 1 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 );
@@ -168,16 +215,17 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
    }
 
    this->distributedArray.setValue( 0 );
+   this->distributedArray.waitForSynchronization();
 
    // use operator[]
    if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
-      for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+      for( IndexType i = 0; i < localRange.getSize(); i++ ) {
          const IndexType gi = localRange.getGlobalIndex( i );
          this->distributedArray[ gi ] = i + 1;
       }
 
       // check set value
-      for( IndexType i = 0; i < localArrayView.getSize(); i++ ) {
+      for( IndexType i = 0; i < localRange.getSize(); i++ ) {
          const IndexType gi = localRange.getGlobalIndex( i );
          EXPECT_EQ( localArrayView.getElement( i ), i + 1 );
          EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 );
@@ -192,8 +240,9 @@ TYPED_TEST( DistributedArrayTest, copyConstructor )
 
    this->distributedArray.setValue( 1 );
    DistributedArrayType copy( this->distributedArray );
-   // Array has "binding" copy-constructor
-   //EXPECT_EQ( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() );
+   // no binding, but deep copy
+   EXPECT_NE( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() );
+   EXPECT_EQ( copy.getLocalView(), this->distributedArray.getLocalView() );
 }
 
 TYPED_TEST( DistributedArrayTest, copyAssignment )
@@ -219,7 +268,7 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators )
    v.setLike( u );
    w.setLike( u );
 
-   for( int i = 0; i < u.getLocalView().getSize(); i ++ ) {
+   for( int i = 0; i < localRange.getSize(); i ++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       u.setElement( gi, i );
       v.setElement( gi, i );
@@ -248,7 +297,7 @@ TYPED_TEST( DistributedArrayTest, containsValue )
 
    const auto localRange = this->distributedArray.getLocalRange();
 
-   for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i % 10 );
    }
@@ -266,7 +315,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue )
 
    const auto localRange = this->distributedArray.getLocalRange();
 
-   for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < localRange.getSize(); i++ ) {
       const IndexType gi = localRange.getGlobalIndex( i );
       this->distributedArray.setElement( gi, i % 10 );
    }
@@ -275,6 +324,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue )
       EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) );
 
    this->distributedArray.setValue( 100 );
+   this->distributedArray.waitForSynchronization();
    EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) );
 }
 
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index 2a1834f318fa616d25a77ccccbdb68bb1cc016a4..a90f09506d083db52e4b45ded4c0a49485d9d7e2 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -11,8 +11,6 @@
 
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Containers/DistributedVectorView.h>
 #include <TNL/Containers/Partitioner.h>
@@ -22,6 +20,7 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::MPI;
 
 /*
  * Light check of DistributedVector.
@@ -37,31 +36,40 @@ class DistributedVectorTest
 protected:
    using RealType = typename DistributedVector::RealType;
    using DeviceType = typename DistributedVector::DeviceType;
-   using CommunicatorType = typename DistributedVector::CommunicatorType;
    using IndexType = typename DistributedVector::IndexType;
    using DistributedVectorType = DistributedVector;
    using VectorViewType = typename DistributedVectorType::LocalViewType;
-   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >;
    using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >;
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
    DistributedVectorType v;
    DistributedVectorView v_view;
    HostDistributedVectorType v_host;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
 
    // should be small enough to have fast tests, but large enough to test
    // scan with multiple CUDA grids
    const int globalSize = 10000 * nproc;
 
+   // some arbitrary even value (but must be 0 if not distributed)
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
    DistributedVectorTest()
    {
       using LocalRangeType = typename DistributedVector::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
-      v.setDistribution( localRange, globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
+      v.setDistribution( localRange, ghosts, globalSize, group );
+
+      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
+      using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
+      v.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
+      v_view.setSynchronizer( v.getSynchronizer() );
+      v_host.setSynchronizer( std::make_shared<HostSynchronizer>( localRange, ghosts / 2, group ) );
+
       v_view.bind( v );
       setConstantSequence( v, 1 );
    }
@@ -69,17 +77,17 @@ protected:
 
 // types for which DistributedVectorTest is instantiated
 using DistributedVectorTypes = ::testing::Types<
-   DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >,
-   DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >
+   DistributedVector< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
-   DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-   DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+   DistributedVector< double, Devices::Cuda, int >
 #endif
 >;
 
 TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes );
 
+// TODO: test that horizontal operations are computed for ghost values without synchronization
+
 TYPED_TEST( DistributedVectorTest, scan )
 {
    using RealType = typename TestFixture::DistributedVectorType::RealType;
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index d0979845376f9addb314ef0dc5f4beca86785126..b79b675cf7237950be48d68992f7bcd8c794b01d 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -13,11 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -62,11 +61,16 @@ protected:
    using RightReal = std::remove_const_t< typename Right::RealType >;
 #ifndef STATIC_VECTOR
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename Left::CommunicatorType;
-      static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value,
-                     "CommunicatorType must be the same for both Left and Right vectors." );
-      using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >;
-      using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >;
+      using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType >;
+      using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType >;
+
+      const MPI_Comm group = AllGroup();
+
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
+
+      // some arbitrary even value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using LeftVector = Vector< LeftReal, typename Left::DeviceType, typename Left::IndexType >;
       using RightVector = Vector< RightReal, typename Right::DeviceType, typename Right::IndexType >;
@@ -90,14 +94,20 @@ protected:
       R2 = 2;
 #else
    #ifdef DISTRIBUTED_VECTOR
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
       using LocalRangeType = typename LeftVector::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group );
-
-      _L1.setDistribution( localRange, size, group );
-      _L2.setDistribution( localRange, size, group );
-      _R1.setDistribution( localRange, size, group );
-      _R2.setDistribution( localRange, size, group );
+      using Synchronizer = typename Partitioner< typename Left::IndexType >::template ArraySynchronizer< typename Left::DeviceType >;
+      const LocalRangeType localRange = Partitioner< typename Left::IndexType >::splitRange( size, group );
+
+      _L1.setDistribution( localRange, ghosts, size, group );
+      _L2.setDistribution( localRange, ghosts, size, group );
+      _R1.setDistribution( localRange, ghosts, size, group );
+      _R2.setDistribution( localRange, ghosts, size, group );
+
+      auto synchronizer = std::make_shared<Synchronizer>( localRange, ghosts / 2, group );
+      _L1.setSynchronizer( synchronizer );
+      _L2.setSynchronizer( synchronizer );
+      _R1.setSynchronizer( synchronizer );
+      _R2.setSynchronizer( synchronizer );
    #else
       _L1.setSize( size );
       _L2.setSize( size );
@@ -147,40 +157,23 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorPairs = ::testing::Types<
    #ifndef HAVE_CUDA
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >,
-
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Host, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >
+      Pair< DistributedVector<     int,   Devices::Host, int >,
+            DistributedVector<     short, Devices::Host, int > >,
+      Pair< DistributedVector<     int,   Devices::Host, int >,
+            DistributedVectorView< short, Devices::Host, int > >,
+      Pair< DistributedVectorView< int,   Devices::Host, int >,
+            DistributedVector<     short, Devices::Host, int > >,
+      Pair< DistributedVectorView< int,   Devices::Host, int >,
+            DistributedVectorView< short, Devices::Host, int > >
    #else
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::MpiCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVector<     int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVector<     short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-            DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >
+      Pair< DistributedVector<     int,   Devices::Cuda, int >,
+            DistributedVector<     short, Devices::Cuda, int > >,
+      Pair< DistributedVector<     int,   Devices::Cuda, int >,
+            DistributedVectorView< short, Devices::Cuda, int > >,
+      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
+            DistributedVector<     short, Devices::Cuda, int > >,
+      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
+            DistributedVectorView< short, Devices::Cuda, int > >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index 649de1cee9c70bc6a0989e6729543c4c23a1744b..32f2d52ba7d8cdba98eb671e57c5a7ae1de64583 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -2,6 +2,7 @@
 
 #include <TNL/Math.h>
 #include <TNL/TypeTraits.h>
+#include <TNL/Devices/Host.h>
 
 template< typename Vector >
 void setLinearSequence( Vector& deviceVector )
@@ -9,15 +10,17 @@ void setLinearSequence( Vector& deviceVector )
 #ifdef STATIC_VECTOR
    Vector a;
 #else
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #endif
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = gi;
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalViewWithGhosts().getSize(); i++ )
+      a.getLocalViewWithGhosts()[ i ] = -1;  // dummy ghost value (ghosts are stored past the local range)
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = i;
@@ -62,10 +65,12 @@ void setNegativeLinearSequence( Vector& deviceVector )
    HostVector a;
    a.setLike( deviceVector );
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = -gi;
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalViewWithGhosts().getSize(); i++ )
+      a.getLocalViewWithGhosts()[ i ] = 1;  // dummy ghost value (ghosts are stored past the local range)
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = -i;
@@ -85,10 +90,12 @@ void setOscilatingSequence( Vector& deviceVector,
    a.setLike( deviceVector );
 #endif
 #ifdef DISTRIBUTED_VECTOR
-   for( int i = 0; i < a.getLocalView().getSize(); i++ ) {
+   for( int i = 0; i < a.getLocalRange().getSize(); i++ ) {
       const auto gi = a.getLocalRange().getGlobalIndex( i );
       a[ gi ] = v * std::pow( -1, gi );
    }
+   for( int i = a.getLocalRange().getSize(); i < a.getLocalViewWithGhosts().getSize(); i++ )
+      a.getLocalViewWithGhosts()[ i ] = 42;  // dummy ghost value (ghosts are stored past the local range)
 #else
    for( int i = 0; i < a.getSize(); i++ )
       a[ i ] = v * std::pow( -1, i );
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index a5beb58d96063d70658e3e317d99275d35036eb9..485265e4e9b0fa43b99b6c84fbf1da7d119b0e82 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -13,11 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -52,10 +51,17 @@ protected:
 #else
    using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >;
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename VectorOrView::CommunicatorType;
-      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
-      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
+
+      const MPI_Comm group = AllGroup();
+
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
+
+      // some arbitrary even value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
@@ -68,19 +74,13 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorTypes = ::testing::Types<
    #ifndef HAVE_CUDA
-      DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator >
+      DistributedVector<           double, Devices::Host, int >,
+      DistributedVectorView<       double, Devices::Host, int >,
+      DistributedVectorView< const double, Devices::Host, int >
    #else
-      DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+      DistributedVector<           double, Devices::Cuda, int >,
+      DistributedVectorView<       double, Devices::Cuda, int >,
+      DistributedVectorView< const double, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -173,14 +173,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
    #define SETUP_UNARY_VECTOR_TEST( size ) \
       using VectorType = typename TestFixture::VectorType;     \
       using VectorOrView = typename TestFixture::VectorOrView; \
-      using CommunicatorType = typename VectorOrView::CommunicatorType; \
-      const auto group = CommunicatorType::AllGroup; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       VectorType _V1, _V2;                                     \
-      _V1.setDistribution( localRange, size, group );          \
-      _V2.setDistribution( localRange, size, group );          \
+      _V1.setDistribution( localRange, this->ghosts, size, this->group ); \
+      _V2.setDistribution( localRange, this->ghosts, size, this->group ); \
+                                                               \
+      auto _synchronizer = std::make_shared<Synchronizer>( localRange, this->ghosts / 2, this->group ); \
+      _V1.setSynchronizer( _synchronizer );                    \
+      _V2.setSynchronizer( _synchronizer );                    \
                                                                \
       _V1 = 1;                                                 \
       _V2 = 2;                                                 \
@@ -194,15 +197,14 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       EXPECTED_VECTOR( TestFixture, function );                \
       using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \
       using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \
-      using CommunicatorType = typename VectorOrView::CommunicatorType; \
-      const auto group = CommunicatorType::AllGroup; \
       using LocalRangeType = typename VectorOrView::LocalRangeType; \
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \
                                                                \
       HostVector _V1h;                                         \
       HostExpectedVector expected_h;                           \
-      _V1h.setDistribution( localRange, size, group );         \
-      expected_h.setDistribution( localRange, size, group );   \
+      _V1h.setDistribution( localRange, this->ghosts, size, this->group ); \
+      expected_h.setDistribution( localRange, this->ghosts, size, this->group ); \
                                                                \
       const double h = (double) (end - begin) / size;          \
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \
@@ -211,10 +213,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
+      for( int i = localRange.getSize(); i < _V1h.getLocalViewWithGhosts().getSize(); i++ ) \
+         _V1h.getLocalViewWithGhosts()[ i ] = expected_h.getLocalViewWithGhosts()[ i ] = 0; \
                                                                \
       VectorType _V1; _V1 = _V1h;                              \
       VectorOrView V1( _V1 );                                  \
       ExpectedVector expected; expected = expected_h;          \
+                                                               \
+      auto _synchronizer = std::make_shared<Synchronizer>( localRange, this->ghosts / 2, this->group ); \
+      _V1.setSynchronizer( _synchronizer );                    \
+      expected.setSynchronizer( _synchronizer );               \
+      expected.startSynchronization();                         \
 
 #else
    #define SETUP_UNARY_VECTOR_TEST( size ) \
@@ -270,11 +279,8 @@ void expect_vectors_near( const Left& _v1, const Right& _v2 )
    using LeftNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Left::RealType > >;
    using RightNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Right::RealType > >;
 #ifdef DISTRIBUTED_VECTOR
-   using CommunicatorType = typename Left::CommunicatorType;
-   static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value,
-                  "CommunicatorType must be the same for both Left and Right vectors." );
-   using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >;
-   using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >;
+   using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >;
+   using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >;
 #else
    using LeftVector = Vector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >;
    using RightVector = Vector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >;
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index 3aa60e6123b06e8b59eccffb967cc157c786c2fa..f73b502ccc9ee12bb812acf963b680991c11a877 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -13,11 +13,10 @@
 #ifdef HAVE_GTEST
 
 #if defined(DISTRIBUTED_VECTOR)
-   #include <TNL/Communicators/MpiCommunicator.h>
-   #include <TNL/Communicators/NoDistrCommunicator.h>
    #include <TNL/Containers/DistributedVector.h>
    #include <TNL/Containers/DistributedVectorView.h>
    #include <TNL/Containers/Partitioner.h>
+   using namespace TNL::MPI;
 #elif defined(STATIC_VECTOR)
    #include <TNL/Containers/StaticVector.h>
 #else
@@ -53,10 +52,17 @@ protected:
 #else
    using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >;
    #ifdef DISTRIBUTED_VECTOR
-      using CommunicatorType = typename VectorOrView::CommunicatorType;
-      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
-      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >;
+      using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
+
+      const MPI_Comm group = AllGroup();
+
+      const int rank = GetRank(group);
+      const int nproc = GetSize(group);
+
+      // some arbitrary value (but must be 0 if not distributed)
+      const int ghosts = (nproc > 1) ? 4 : 0;
    #else
       using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >;
       template< typename Real >
@@ -76,11 +82,11 @@ protected:
       setLinearSequence( V1 );
 #else
    #ifdef DISTRIBUTED_VECTOR
-      const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
       using LocalRangeType = typename VectorOrView::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group );
-
-      _V1.setDistribution( localRange, size, group );
+      using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >;
+      const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, group );
+      _V1.setDistribution( localRange, ghosts, size, group );
+      _V1.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
    #else
       _V1.setSize( size );
    #endif
@@ -104,19 +110,13 @@ protected:
 #if defined(DISTRIBUTED_VECTOR)
    using VectorTypes = ::testing::Types<
    #ifndef HAVE_CUDA
-      DistributedVector<           double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Host, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator >
+      DistributedVector<           double, Devices::Host, int >,
+      DistributedVectorView<       double, Devices::Host, int >,
+      DistributedVectorView< const double, Devices::Host, int >
    #else
-      DistributedVector<           double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >,
-      DistributedVector<           double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView<       double, Devices::Cuda, int, Communicators::NoDistrCommunicator >,
-      DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator >
+      DistributedVector<           double, Devices::Cuda, int >,
+      DistributedVectorView<       double, Devices::Cuda, int >,
+      DistributedVectorView< const double, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt
index 5be285b5e56a9857509dd367e9246a4a99a5dbaf..f5fb11bdfa28777655558d0a36361a107ba8cd3e 100644
--- a/src/UnitTests/Containers/ndarray/CMakeLists.txt
+++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt
@@ -58,13 +58,17 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArray_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArray_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" )
 endif()
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
index 7377cbff2450debc69d05cc3e1fa69b7b20ba8e8..36c4ea5b7039974867f15697dc5c49f54bdcddd6 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h
@@ -9,9 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
@@ -35,7 +32,6 @@ class DistributedNDArrayOverlaps_1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -46,17 +42,17 @@ protected:
    const int globalSize = 97;  // prime number to force non-uniform distribution
    const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} );
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArrayOverlaps_1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( globalSize );
       distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -72,30 +68,14 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Host >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
-// TODO: does it make sense for NoDistrCommunicator?
-//   DistributedNDArray< NDArray< double,
-//                                SizesHolder< int, 0 >,
-//                                std::index_sequence< 0 >,
-//                                Devices::Host >,
-//                       Communicators::NoDistrCommunicator,
-//                       std::index_sequence< 2 > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
                                 Devices::Cuda >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 2 > >
-// TODO: does it make sense for NoDistrCommunicator?
-//   DistributedNDArray< NDArray< double,
-//                                SizesHolder< int, 0 >,
-//                                std::index_sequence< 0 >,
-//                                Devices::Cuda >,
-//                       Communicators::NoDistrCommunicator,
-//                       std::index_sequence< 2 > >
 #endif
 >;
 
@@ -103,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 0 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize );
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
index f1ac970eb2d26f268cc5bd294b05ffd5dcb78551..0b6838639f0c689f132fb4077c275f5edb6e0d92 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h
@@ -9,9 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/DistributedNDArraySynchronizer.h>
@@ -35,7 +32,6 @@ class DistributedNDArrayOverlaps_semi1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -46,17 +42,17 @@ protected:
    const int globalSize = 97;  // prime number to force non-uniform distribution
    const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} );
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArrayOverlaps_semi1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( 0, globalSize, globalSize / 2 );
       distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -72,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
                                 Devices::Host >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 0, 2, 0 > >
 #ifdef HAVE_CUDA
    ,
@@ -80,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types<
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
                                 Devices::Cuda >,
-                       Communicators::MpiCommunicator,
                        std::index_sequence< 0, 2, 0 > >
 #endif
 >;
@@ -89,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayType
 
 TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 1 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize );
 
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index a8d3bcdab1cf911cf5386d050e0cc29636340a86..e5519297133d8630e7a6b7c1ed1e68b828e8bd4c 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -9,9 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
@@ -34,7 +31,6 @@ class DistributedNDArray_1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -44,17 +40,17 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArray_1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( globalSize );
       distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -69,25 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types<
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
-                                Devices::Host >,
-                       Communicators::MpiCommunicator >,
-   DistributedNDArray< NDArray< double,
-                                SizesHolder< int, 0 >,
-                                std::index_sequence< 0 >,
-                                Devices::Host >,
-                       Communicators::NoDistrCommunicator >
+                                Devices::Host > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 0 >,
                                 std::index_sequence< 0 >,
-                                Devices::Cuda >,
-                       Communicators::MpiCommunicator >,
-   DistributedNDArray< NDArray< double,
-                                SizesHolder< int, 0 >,
-                                std::index_sequence< 0 >,
-                                Devices::Cuda >,
-                       Communicators::NoDistrCommunicator >
+                                Devices::Cuda > >
 #endif
 >;
 
@@ -95,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 0 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize );
 }
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
index 6f777c215aab824f8d7a2c2916bd459845194e55..e3cbb3223c9e411105a019dcefd4ba29c047c179 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h
@@ -9,9 +9,6 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Containers/DistributedNDArray.h>
 #include <TNL/Containers/DistributedNDArrayView.h>
 #include <TNL/Containers/ArrayView.h>
@@ -34,7 +31,6 @@ class DistributedNDArray_semi1D_test
 protected:
    using ValueType = typename DistributedNDArray::ValueType;
    using DeviceType = typename DistributedNDArray::DeviceType;
-   using CommunicatorType = typename DistributedNDArray::CommunicatorType;
    using IndexType = typename DistributedNDArray::IndexType;
    using DistributedNDArrayType = DistributedNDArray;
 
@@ -44,17 +40,17 @@ protected:
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = TNL::MPI::AllGroup();
 
    DistributedNDArrayType distributedNDArray;
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
 
    DistributedNDArray_semi1D_test()
    {
       using LocalRangeType = typename DistributedNDArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       distributedNDArray.setSizes( 0, globalSize, globalSize / 2 );
       distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group );
       distributedNDArray.allocate();
@@ -69,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types<
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y, Z
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
-                                Devices::Host >,
-                       Communicators::MpiCommunicator >
+                                Devices::Host > >
 #ifdef HAVE_CUDA
    ,
    DistributedNDArray< NDArray< double,
                                 SizesHolder< int, 9, 0, 0 >,  // Q, X, Y, Z
                                 std::index_sequence< 0, 1, 2 >,  // permutation - should not matter
-                                Devices::Cuda >,
-                       Communicators::NoDistrCommunicator >
+                                Devices::Cuda > >
 #endif
 >;
 
@@ -85,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes );
 
 TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const auto localRange = this->distributedNDArray.template getLocalRange< 1 >();
    const int localSize = localRange.getEnd() - localRange.getBegin();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize );
 }
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 65723ac889325a97ecc9db56081ce0d901d9e271..b713c8f0ca76d534b8abb903097b77b8bc8bd22b 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -58,4 +58,5 @@ if( ${BUILD_MPI} )
 
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedMatrixTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedMatrixTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" )
 endif()
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index 30a76f86a60c707926c6f2b7aa88de08964477c0..5e893e111221272912d8c874797b304ab7e68142 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -9,13 +9,12 @@
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Matrices/SparseMatrix.h>
 
 using namespace TNL;
+using namespace TNL::MPI;
 
 template< typename Vector >
 void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = 0 )
@@ -33,7 +32,7 @@ void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset =
 template< typename Matrix, typename RowCapacities >
 void setMatrix( Matrix& matrix, const RowCapacities& rowCapacities )
 {
-   using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential >, typename Matrix::CommunicatorType >;
+   using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential > >;
    using HostRowCapacities = typename RowCapacities::template Self< typename RowCapacities::RealType, TNL::Devices::Sequential >;
 
    HostMatrix hostMatrix;
@@ -66,20 +65,19 @@ class DistributedMatrixTest
 protected:
    using RealType = typename DistributedMatrix::RealType;
    using DeviceType = typename DistributedMatrix::DeviceType;
-   using CommunicatorType = typename DistributedMatrix::CommunicatorType;
    using IndexType = typename DistributedMatrix::IndexType;
    using DistributedMatrixType = DistributedMatrix;
 
    using RowCapacitiesVector = typename DistributedMatrixType::CompressedRowLengthsVector;
    using GlobalVector = Containers::Vector< RealType, DeviceType, IndexType >;
-   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >;
+   using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >;
 
    const int globalSize = 97;  // prime number to force non-uniform distribution
 
-   const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup;
+   const MPI_Comm group = AllGroup();
 
-   const int rank = CommunicatorType::GetRank(group);
-   const int nproc = CommunicatorType::GetSize(group);
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
 
    DistributedMatrixType matrix;
 
@@ -88,9 +86,9 @@ protected:
    DistributedMatrixTest()
    {
       using LocalRangeType = typename DistributedMatrix::LocalRangeType;
-      const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group );
+      const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( globalSize, group );
       matrix.setDistribution( localRange, globalSize, globalSize, group );
-      rowCapacities.setDistribution( localRange, globalSize, group );
+      rowCapacities.setDistribution( localRange, 0, globalSize, group );
 
       EXPECT_EQ( matrix.getLocalRowRange(), localRange );
       EXPECT_EQ( matrix.getCommunicationGroup(), group );
@@ -101,12 +99,10 @@ protected:
 
 // types for which DistributedMatrixTest is instantiated
 using DistributedMatrixTypes = ::testing::Types<
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int > >
 #ifdef HAVE_CUDA
    ,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int > >
 #endif
 >;
 
@@ -114,11 +110,9 @@ TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes );
 
 TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes )
 {
-   using CommunicatorType = typename TestFixture::CommunicatorType;
-
    const int localSize = this->matrix.getLocalMatrix().getRows();
    int sumOfLocalSizes = 0;
-   CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
+   Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group );
    EXPECT_EQ( sumOfLocalSizes, this->globalSize );
    EXPECT_EQ( this->matrix.getRows(), this->globalSize );
 }
@@ -218,7 +212,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
 
    GlobalVector inVector( this->globalSize );
    inVector.setValue( 1 );
-   DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    this->matrix.vectorProduct( inVector, outVector );
 
    EXPECT_EQ( outVector, this->rowCapacities )
@@ -233,9 +227,9 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
    this->matrix.setRowCapacities( this->rowCapacities );
    setMatrix( this->matrix, this->rowCapacities );
 
-   DistributedVector inVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector inVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    inVector.setValue( 1 );
-   DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() );
+   DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() );
    this->matrix.vectorProduct( inVector, outVector );
 
    EXPECT_EQ( outVector, this->rowCapacities )
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
index 54071032c5ceba47386286a7fd30c317859a8da3..dccd68f23b4b8cf678d6a1f156e8f359bd15762b 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
@@ -1,9 +1,8 @@
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI  
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
@@ -12,30 +11,25 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
-typedef MpiCommunicator CommunicatorType;
-
-template<
-        typename MeshType,
-        typename CommunicatorType>
+template< typename MeshType >
 void SetUpDistributedGrid(DistributedMesh<MeshType> &distributedGrid, MeshType &globalGrid,int size,typename MeshType::CoordinatesType distribution )
 {
     typename MeshType::PointType globalOrigin;
     typename MeshType::PointType globalProportions;
     using DistributedMeshType = DistributedMesh< MeshType >;
-    
+
     globalOrigin.setValue( -0.5 );
     globalProportions.setValue( size );
 
     globalGrid.setDimensions( size );
     globalGrid.setDomain( globalOrigin,globalProportions );
-    
+
     distributedGrid.setDomainDecomposition( distribution );
-    distributedGrid.template setGlobalGrid<CommunicatorType>(globalGrid);
+    distributedGrid.setGlobalGrid(globalGrid);
     typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-    SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+    SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
     distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 }
 
@@ -44,47 +38,47 @@ void SetUpDistributedGrid(DistributedMesh<MeshType> &distributedGrid, MeshType &
 TEST(CutDistributedGirdTest_2D, IsInCut)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
             StaticVector<1,int>(5)
             );
 
-    if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%3==1)
+    if(TNL::MPI::GetRank()%3==1)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_2D, GloblaGridDimesion)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -92,24 +86,24 @@ TEST(CutDistributedGirdTest_2D, GloblaGridDimesion)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
 
 TEST(CutDistributedGirdTest_2D, IsDistributed)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(3,4));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -123,17 +117,17 @@ TEST(CutDistributedGirdTest_2D, IsDistributed)
 TEST(CutDistributedGirdTest_2D, IsNotDistributed)
 {
     typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(12,1));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(12,1));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -149,47 +143,47 @@ TEST(CutDistributedGirdTest_2D, IsNotDistributed)
 TEST(CutDistributedGirdTest_3D, IsInCut_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
             StaticVector<2,int>(2,2)
             );
 
-    if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%4==0)
+    if(TNL::MPI::GetRank()%4==0)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -197,24 +191,24 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
 
 TEST(CutDistributedGirdTest_3D, IsDistributed_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -228,17 +222,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_1D)
 TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<1,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<1,int>(2),
             StaticVector<2,int>(0,1),
@@ -254,48 +248,48 @@ TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D)
 TEST(CutDistributedGirdTest_3D, IsInCut_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    bool result=cutDistributedGrid.SetupByCut<CommunicatorType>(
+    bool result=cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
             StaticVector<1,int>(5)
             );
 
-    int rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
+    int rank=TNL::MPI::GetRank();
     if(rank>3 && rank<8)
     {
-        ASSERT_TRUE(result); 
+        ASSERT_TRUE(result);
     }
     else
     {
         ASSERT_FALSE(result);
-    }  
+    }
 }
 
 TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
@@ -303,7 +297,7 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
             ))
     {
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),2) << "Dimenze globálního gridu neodpovídajá řezu";
-        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; 
+        EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají";
         EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().y(),10) << "Rozměry globálního gridu neodpovídají";
     }
 }
@@ -311,17 +305,17 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D)
 TEST(CutDistributedGirdTest_3D, IsDistributed_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
@@ -335,17 +329,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_2D)
 TEST(CutDistributedGirdTest_3D, IsNotDistributed_2D)
 {
     typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType; 
+    typedef typename MeshType::CoordinatesType CoordinatesType;
     typedef DistributedMesh<MeshType> DistributedMeshType;
     typedef Grid<2,double,Host,int> CutGridType;
     typedef DistributedMesh<CutGridType> CutDistributedGridType;
 
     MeshType globalGrid;
     DistributedMeshType distributedGrid;
-    SetUpDistributedGrid<MeshType,CommunicatorType>(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12));
+    SetUpDistributedGrid<MeshType>(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12));
 
     CutDistributedGridType cutDistributedGrid;
-    if(cutDistributedGrid.SetupByCut<CommunicatorType>(
+    if(cutDistributedGrid.SetupByCut(
             distributedGrid,
             StaticVector<2,int>(0,1),
             StaticVector<1,int>(2),
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
index 4d5bb4baf66abbabcf6f989e4ed8389acf0d7b9a..9ad46b41221a4cb230ab67fdda50b3ef2636199d 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
@@ -6,7 +6,6 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Functions/CutMeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -18,9 +17,6 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
-
-typedef MpiCommunicator CommunicatorType;
 
 static const char* TEST_FILE_NAME = "test_CutDistributedMeshFunctionTest.tnl";
 
@@ -52,9 +48,9 @@ TEST(CutDistributedMeshFunction, 2D_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
 
@@ -73,14 +69,14 @@ TEST(CutDistributedMeshFunction, 2D_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -134,9 +130,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -154,14 +150,14 @@ TEST(CutDistributedMeshFunction, 3D_1_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(2),
             StaticVector<2,int>(1,0),
@@ -215,9 +211,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Data)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -235,14 +231,14 @@ TEST(CutDistributedMeshFunction, 3D_2_Data)
 
    DistributedMeshSynchronizer< DistributedMeshType > synchronizer;
    synchronizer.setDistributedGrid( &distributedGrid );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+   synchronizer.synchronize( *meshFunctionptr );
 
    //Prepare Mesh Function parts for Cut
    CutDistributedMeshType cutDistributedGrid;
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -302,9 +298,9 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -325,7 +321,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<1,int>(0),
@@ -338,7 +334,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -387,9 +383,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
 
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>( globalOriginalGrid );
+   distributedGrid.setGlobalGrid( globalOriginalGrid );
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -410,7 +406,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<2,int>(0,2),
@@ -423,7 +419,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -476,9 +472,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
    overlap.setValue(1);
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>(globalOriginalGrid);
+   distributedGrid.setGlobalGrid(globalOriginalGrid);
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -499,7 +495,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -512,7 +508,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization)
 
         DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer;
         synchronizer.setDistributedGrid( &cutDistributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( cutMeshFunction );
+        synchronizer.synchronize( cutMeshFunction );
 
         typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
         typename CutMeshType::Cell outEntity(*cutGrid);
@@ -563,9 +559,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
    overlap.setValue(1);
    DistributedMeshType distributedGrid;
    distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) );
-   distributedGrid.template setGlobalGrid<CommunicatorType>( globalOriginalGrid );
+   distributedGrid.setGlobalGrid( globalOriginalGrid );
    typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+   SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
    distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
    Pointers::SharedPointer<MeshType> originalGrid;
@@ -586,7 +582,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
    Pointers::SharedPointer<CutMeshType> cutGrid;
    cutGrid->setDistMesh(&cutDistributedGrid);
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<CommunicatorType, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
             *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(0,2),
             StaticVector<1,int>(1),
@@ -600,9 +596,8 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
         DistributedGridIO<MeshFunctionView<CutMeshType>,MpiIO> ::save(TEST_FILE_NAME, cutMeshFunction );
 
         //save globalgrid for debug render
-        typename CommunicatorType::CommunicationGroup *group;
-        group=(typename CommunicatorType::CommunicationGroup *)(cutDistributedGrid.getCommunicationGroup());
-        if(CommunicatorType::GetRank(*group)==0)
+        MPI_Comm group=cutDistributedGrid.getCommunicationGroup();
+        if(TNL::MPI::GetRank(group)==0)
         {
             File meshFile;
             meshFile.open( TEST_FILE_NAME+String("-mesh.tnl"),std::ios_base::out);
@@ -612,7 +607,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
 
     }
 
-   if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+   if(TNL::MPI::GetRank()==0)
    {
        Pointers::SharedPointer<CutMeshType> globalCutGrid;
        MeshFunctionView<CutMeshType> loadMeshFunctionptr;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
index 5d034087f35a6ba75a96d84e692d6876bfbcb374..6621a01dd971715e923a28b5faa173d1bac044d9 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
@@ -1,11 +1,10 @@
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
 #include <TNL/Functions/CutMeshFunction.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Meshes/Grid.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 
 #include "../../Functions/Functions.h"
 
@@ -14,7 +13,6 @@ using namespace TNL::Containers;
 using namespace TNL::Functions;
 using namespace TNL::Meshes;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
 TEST(CutMeshFunction, 2D)
@@ -28,12 +26,12 @@ TEST(CutMeshFunction, 2D)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,2> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -43,18 +41,18 @@ TEST(CutMeshFunction, 2D)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(0),
             StaticVector<1,int>(1),
             StaticVector<1,typename CutMeshType::IndexType>(5) );
@@ -62,13 +60,13 @@ TEST(CutMeshFunction, 2D)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
        typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
        typename CutMeshType::Cell outEntity(*cutGrid);
-       
+
         fromEntity.getCoordinates().x()=i;
         fromEntity.getCoordinates().y()=5;
         outEntity.getCoordinates().x()=i;
@@ -91,12 +89,12 @@ TEST(CutMeshFunction, 3D_1)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,3> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -106,18 +104,18 @@ TEST(CutMeshFunction, 3D_1)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator,MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<1,int>(1),
             StaticVector<2,int>(0,2),
             StaticVector<2,typename CutMeshType::IndexType>(5,5) );
@@ -125,13 +123,13 @@ TEST(CutMeshFunction, 3D_1)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
        typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
        typename CutMeshType::Cell outEntity(*cutGrid);
-       
+
         fromEntity.getCoordinates().x()=5;
         fromEntity.getCoordinates().y()=i;
         fromEntity.getCoordinates().z()=5;
@@ -154,12 +152,12 @@ TEST(CutMeshFunction, 3D_2)
    typedef typename MeshType::Cell Cell;
 
    typedef LinearFunction<double,3> LinearFunctionType;
-  
+
 
    //Original MeshFunciton --filed with linear function
    Pointers::SharedPointer<MeshType> originalGrid;
    Pointers::SharedPointer<MeshFunctionView<MeshType>> meshFunctionptr;
- 
+
    PointType origin;
    origin.setValue(-0.5);
    PointType proportions;
@@ -169,18 +167,18 @@ TEST(CutMeshFunction, 3D_2)
 
 
    DofType dof(originalGrid->template getEntitiesCount< Cell >());
-   dof.setValue(0); 
+   dof.setValue(0);
    meshFunctionptr->bind(originalGrid,dof);
 
    MeshFunctionEvaluator< MeshFunctionView<MeshType>, LinearFunctionType > linearFunctionEvaluator;
    Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
- 
-   //Prepare Mesh Function parts for Cut 
+
+   //Prepare Mesh Function parts for Cut
    Pointers::SharedPointer<CutMeshType> cutGrid;
    DofType cutDof(0);
-   bool inCut=CutMeshFunction<NoDistrCommunicator, MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
-            *meshFunctionptr,*cutGrid, cutDof, 
+   bool inCut=CutMeshFunction<MeshFunctionView<MeshType>,CutMeshType,DofType>::Cut(
+            *meshFunctionptr,*cutGrid, cutDof,
             StaticVector<2,int>(2,1),
             StaticVector<1,int>(0),
             StaticVector<1,typename CutMeshType::IndexType>(5) );
@@ -188,7 +186,7 @@ TEST(CutMeshFunction, 3D_2)
    ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu";
 
    MeshFunctionView<CutMeshType> cutMeshFunction;
-   cutMeshFunction.bind(cutGrid,cutDof); 
+   cutMeshFunction.bind(cutGrid,cutDof);
 
     for(int i=0;i<10;i++)
     {
@@ -196,7 +194,7 @@ TEST(CutMeshFunction, 3D_2)
         {
            typename MeshType::Cell fromEntity(meshFunctionptr->getMesh());
            typename CutMeshType::Cell outEntity(*cutGrid);
-           
+
             fromEntity.getCoordinates().x()=5;
             fromEntity.getCoordinates().y()=j;
             fromEntity.getCoordinates().z()=i;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
index 6b7c489af4acdf0596426409586b49fd22424178..11a85b68ded774ea4cbd8edc7ba49a5a18a64b88 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
@@ -6,7 +6,6 @@
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
@@ -18,7 +17,6 @@
 using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 
@@ -186,8 +184,6 @@ class ParameterProvider<3,Device>
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, typename Device>
 class TestDistributedGridIO
 {
@@ -227,9 +223,9 @@ class TestDistributedGridIO
         overlap.setValue(1);
         DistributedGridType distributedGrid;
         distributedGrid.setDomainDecomposition( parameters.getDistr() );
-        distributedGrid.template setGlobalGrid<CommunicatorType>( globalGrid );
+        distributedGrid.setGlobalGrid( globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         //std::cout << distributedGrid.printProcessDistr() <<std::endl;
@@ -249,8 +245,8 @@ class TestDistributedGridIO
 
 
        //create similar local mesh function and evaluate linear function on it
-        PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup));
-        PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));;
+        PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank());
+        PointType localProportions=parameters.getProportions(TNL::MPI::GetRank());
 
         Pointers::SharedPointer<MeshType>  localGridptr;
         localGridptr->setDimensions(localProportions);
@@ -313,14 +309,14 @@ class TestDistributedGridIO
         overlap.setValue(1);
         DistributedGridType distributedGrid;
         distributedGrid.setDomainDecomposition( parameters.getDistr() );
-        distributedGrid.template setGlobalGrid<CommunicatorType>( globalGrid );
+        distributedGrid.setGlobalGrid( globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         //save files from local mesh
-        PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup));
-        PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));;
+        PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank());
+        PointType localProportions=parameters.getProportions(TNL::MPI::GetRank());
 
         Pointers::SharedPointer<MeshType> localGridptr;
         localGridptr->setDimensions(localProportions);
@@ -355,7 +351,7 @@ class TestDistributedGridIO
 
         DistributedMeshSynchronizer< DistributedGridType > synchronizer;
         synchronizer.setDistributedGrid( &distributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
+        synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled correctly in loadDof
 
         //Crete "distributedgrid driven" grid filed by evaluated linear function
         Pointers::SharedPointer<MeshType> gridptr;
@@ -367,7 +363,7 @@ class TestDistributedGridIO
         meshFunctionptr->bind(gridptr,dof);
 
         linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
-        synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
+        synchronizer.synchronize( *meshFunctionptr );
 
         for(int i=0;i<dof.getSize();i++)
         {
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
index 5bbad8f03b46fe69ec3237136503452f5f090b40..00705c31ffc50de1447f6a50e22a3d9616e500a5 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
@@ -14,7 +14,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 
@@ -24,13 +23,10 @@
 using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, typename Device>
 class TestDistributedGridMPIIO{
     public:
@@ -63,9 +59,9 @@ class TestDistributedGridMPIIO{
         globalGrid->setDomain(globalOrigin,globalProportions);
 
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         ///std::cout << distributedGrid.printProcessDistr() <<std::endl;
@@ -84,7 +80,7 @@ class TestDistributedGridMPIIO{
         DistributedGridIO<MeshFunctionType,MpiIO> ::save(FileName, *meshFunctionptr );
 
        //first process compare results
-       if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+       if(TNL::MPI::GetRank()==0)
        {
             DofType globalEvaluatedDof(globalGrid->template getEntitiesCount< Cell >());
 
@@ -131,15 +127,15 @@ class TestDistributedGridMPIIO{
         CoordinatesType overlap;
         overlap.setValue(1);
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
         String FileName=String("test-file-mpiio-load.tnl");
 
         //Prepare file
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+        if(TNL::MPI::GetRank()==0)
         {
             DofType saveDof(globalGrid->template getEntitiesCount< Cell >());
 
@@ -165,7 +161,7 @@ class TestDistributedGridMPIIO{
 
         DistributedMeshSynchronizer< DistributedGridType > synchronizer;
         synchronizer.setDistributedGrid( &distributedGrid );
-        synchronizer.template synchronize<CommunicatorType>( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof
+        synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled correctly in loadDof
 
         Pointers::SharedPointer<MeshType> evalGridPtr;
         Pointers::SharedPointer<MeshFunctionType> evalMeshFunctionptr;
@@ -176,14 +172,14 @@ class TestDistributedGridMPIIO{
         evalMeshFunctionptr->bind(evalGridPtr,evalDof);
 
         linearFunctionEvaluator.evaluateAllEntities(evalMeshFunctionptr , linearFunctionPtr);
-        synchronizer.template synchronize<CommunicatorType>( *evalMeshFunctionptr );
+        synchronizer.synchronize( *evalMeshFunctionptr );
 
         for(int i=0;i<evalDof.getSize();i++)
         {
             EXPECT_EQ( evalDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
 
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+        if(TNL::MPI::GetRank()==0)
         {
             EXPECT_EQ( std::remove( FileName.getString()) , 0 );
         }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
index 7cb44ef229fa58399572a0916c6b76dcaaaea527..9a3952bc3f9f09e970dd9c016810e39d5929dadb 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
@@ -7,12 +7,11 @@
  ***************************************************************************/
 
 
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -26,7 +25,6 @@ using namespace TNL::Meshes;
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 
 
 template<typename DofType>
@@ -44,13 +42,13 @@ void check_Boundary_1D(int rank, int nproc, const DofType& dof, typename DofType
         EXPECT_EQ( dof[0], expectedValue) << "Left boundary test failed";
         return;
     }
-    
+
     if(rank==(nproc-1))//Right
     {
         EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Right boundary test failed";
         return;
     }
-    
+
 };
 
 template<typename DofType>
@@ -61,15 +59,15 @@ void check_Overlap_1D(int rank, int nproc, const DofType& dof, typename DofType:
         EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Left boundary node overlap test failed";
         return;
     }
-    
+
     if( rank == ( nproc - 1 ) )
     {
         EXPECT_EQ( dof[0], expectedValue) << "Right boundary node overlap test failed";
         return;
     }
-    
+
     EXPECT_EQ( dof[0], expectedValue) << "left overlap test failed";
-    EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed";    
+    EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed";
 };
 
 template<typename DofType>
@@ -80,25 +78,24 @@ void check_Inner_1D(int rank, int nproc, const DofType& dof, typename DofType::R
 };
 
 /*
- * Light check of 1D distributed grid and its synchronization. 
+ * Light check of 1D distributed grid and its synchronization.
  * Number of process is not limited.
  * Overlap is limited to 1
  * Only double is tested as dof Real type -- it may be changed, extend test
  * Global size is hardcoded as 10 -- it can be changed, extend test
  */
 
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<1,double,Host,int> GridType;
 typedef MeshFunctionView< GridType > MeshFunctionType;
 typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType;
 typedef Vector< double,Host,int> DofType;
 typedef Vector< bool, Host, int > MaskDofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
-     
+
 class DistributedGridTest_1D : public ::testing::Test
 {
    protected:
@@ -123,14 +120,14 @@ class DistributedGridTest_1D : public ::testing::Test
       void SetUp()
       {
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
          GridType globalGrid;
 
-         globalOrigin.x()=-0.5;    
+         globalOrigin.x()=-0.5;
          globalProportions.x()=size;
 
 
@@ -142,9 +139,9 @@ class DistributedGridTest_1D : public ::testing::Test
          distributedGrid=new DistributedGridType();
 
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
-         //distributedGrid->setupGrid(*gridptr);    
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         distributedGrid->setGlobalGrid( globalGrid );
+         //distributedGrid->setupGrid(*gridptr);
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
 
@@ -155,14 +152,14 @@ class DistributedGridTest_1D : public ::testing::Test
 
          constFunctionPtr->Number=rank;
       }
-      
+
       void SetUpPeriodicBoundaries()
       {
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
-         distributedGrid->setupGrid(*gridptr);         
+         distributedGrid->setupGrid(*gridptr);
       }
 
       void TearDown()
@@ -209,7 +206,7 @@ TEST_F(DistributedGridTest_1D, evaluateInteriorEntities)
    check_Boundary_1D(rank, nproc, dof, -1);
    check_Overlap_1D(rank, nproc, dof, -1);
    check_Inner_1D(rank, nproc, dof, rank);
-}    
+}
 
 TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
 {
@@ -217,7 +214,7 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
+   synchronizer.synchronize( *meshFunctionPtr );
 
    if(rank!=0) {
       EXPECT_EQ((dof)[0],rank-1)<< "Left Overlap was filled by wrong process.";
@@ -229,12 +226,12 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest )
 
 TEST_F(DistributedGridTest_1D, EvaluateLinearFunction )
 {
-   //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) 
+   //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid)
    setDof_1D(dof,-1);
    linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr);
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
+   synchronizer.synchronize( *meshFunctionPtr );
 
    auto entity = gridptr->template getEntity< Cell >(0);
    entity.refresh();
@@ -250,7 +247,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -258,13 +255,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask )
    maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridptr, dof );
    maskPointer->bind( gridptr, maskDofs );
-   
+
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true );
+   synchronizer.synchronize( *meshFunctionPtr, true );
 
    if( rank == 0 ) {
       EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process.";
@@ -279,7 +276,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -287,14 +284,14 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask )
    maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridptr, dof );
    maskPointer->bind( gridptr, maskDofs );
-   
+
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr, constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true, maskPointer );
+   synchronizer.synchronize( *meshFunctionPtr, true, maskPointer );
    if( rank == 0 ) {
       EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process.";
    }
@@ -310,7 +307,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -325,9 +322,9 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
    TNL_MPI_PRINT( "#### " << dof );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
+   meshFunctionPtr->synchronize( true, maskPointer );
    TNL_MPI_PRINT( ">>> " << dof );
-   
+
    if( rank == 0 )
       EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process.";
    if( rank == nproc-1 )
@@ -339,7 +336,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask )
    // Setup periodic boundaries
    // TODO: I do not know how to do it better with GTEST
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -350,27 +347,27 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask )
 
    setDof_1D( dof, -rank-1 );
    maskDofs.setValue( true );
-   maskDofs.setElement( 1, false );   
+   maskDofs.setElement( 1, false );
    maskDofs.setElement( dof.getSize() - 2, false );
    //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
       EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process.";
    if( rank == nproc-1 )
-      EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process.";   
-   
+      EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process.";
+
 }
 */
 
 TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridptr);
@@ -382,13 +379,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest )
 
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr, true );
+   synchronizer.synchronize( *meshFunctionPtr, true );
 
    auto entity = gridptr->template getEntity< Cell >( 0 );
    auto entity2= gridptr->template getEntity< Cell >( (dof).getSize() - 1 );
    entity.refresh();
    entity2.refresh();
-   
+
    if( rank == 0 ) {
       EXPECT_EQ( meshFunctionPtr->getValue(entity), 9 ) << "Linear function Overlap error on left Edge.";
    }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
index 71370cae2d0530f871cc4148ac0f06b58696e3c0..1f02dd2364e25a3fa40cf457f9e3f7ad353199ac 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
@@ -7,14 +7,13 @@
  ***************************************************************************/
 
 
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 
@@ -25,10 +24,9 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
- 
+
 
 template<typename DofType>
 void setDof_2D( DofType &dof, typename DofType::RealType value )
@@ -46,7 +44,7 @@ void checkLeftEdge( const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxy;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin;i<end;i++ )
             EXPECT_EQ( dof[maxx*i], expectedValue) << "Left Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -60,8 +58,8 @@ void checkRightEdge(const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxy;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
-    for( int i = begin; i < end; i++ ) 
+
+    for( int i = begin; i < end; i++ )
             EXPECT_EQ( dof[maxx*i+(maxx-1)], expectedValue) << "Right Edge test failed " << i <<" " << maxx << " "<< maxy;
 }
 
@@ -74,7 +72,7 @@ void checkUpEdge( const GridType &grid, const DofType &dof, bool with_first, boo
     int end = maxx;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin; i<end; i++ )
             EXPECT_EQ( dof[i], expectedValue) << "Up Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -88,7 +86,7 @@ void checkDownEdge( const GridType &grid, const DofType &dof, bool with_first, b
     int end = maxx;
     if( !with_first ) begin++;
     if( !with_last ) end--;
-    
+
     for( int i=begin; i<end; i++ )
             EXPECT_EQ( dof[maxx*(maxy-1)+i], expectedValue) << "Down Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -102,7 +100,7 @@ void checkLeftBoundary( const GridType &grid, const DofType &dof, bool with_firs
    int end = maxy - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
+
    for( int i=begin;i<end;i++ )
       EXPECT_EQ( dof[ maxx * i + 1 ], expectedValue) << "Left Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -116,8 +114,8 @@ void checkRightBoundary(const GridType &grid, const DofType &dof, bool with_firs
    int end = maxy - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
-   for( int i = begin; i < end; i++ ) 
+
+   for( int i = begin; i < end; i++ )
      EXPECT_EQ( dof[ maxx * i + ( maxx - 2 ) ], expectedValue) << "Right Edge test failed " << i <<" " << maxx << " "<< maxy;
 }
 
@@ -130,7 +128,7 @@ void checkUpBoundary( const GridType &grid, const DofType &dof, bool with_first,
    int end = maxx - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-    
+
    for( int i=begin; i<end; i++ )
       EXPECT_EQ( dof[ maxx + i ], expectedValue) << "Up Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -144,7 +142,7 @@ void checkDownBoundary( const GridType &grid, const DofType &dof, bool with_firs
    int end = maxx - 1;
    if( !with_first ) begin++;
    if( !with_last ) end--;
-   
+
    for( int i=begin; i<end; i++ )
       EXPECT_EQ( dof[ maxx * ( maxy-2 ) + i ], expectedValue) << "Down Edge test failed " << i<<" " << maxx << " "<< maxy;
 }
@@ -176,51 +174,51 @@ void checkCorner(const GridType &grid, const DofType &dof, bool up, bool left, t
 /*expecting 9 processes*/
 template<typename DofType,typename GridType>
 void check_Boundary_2D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue)
-{    
+{
 
     if(rank==0)//Up Left
     {
         checkUpEdge(grid,dof,true,false,expectedValue);//posledni je overlap
         checkLeftEdge(grid,dof,true,false, expectedValue);//posledni je overlap
     }
-    
+
     if(rank==1)//Up Center
     {
         checkUpEdge(grid,dof,false,false, expectedValue);//prvni a posledni je overlap
     }
-    
+
     if(rank==2)//Up Right
     {
         checkUpEdge(grid,dof,false,true,expectedValue);//prvni je overlap
         checkRightEdge(grid,dof,true,false,expectedValue);//posledni je overlap
     }
-    
+
     if(rank==3)//Center Left
     {
         checkLeftEdge(grid,dof,false,false,expectedValue);//prvni a posledni je overlap
     }
-    
+
     if(rank==4)//Center Center
     {
         //No boundary
     }
-    
+
     if(rank==5)//Center Right
     {
         checkRightEdge(grid,dof,false,false,expectedValue);
     }
-    
+
     if(rank==6)//Down Left
     {
         checkDownEdge(grid,dof,true,false,expectedValue);
         checkLeftEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==7) //Down Center
     {
         checkDownEdge(grid,dof,false,false,expectedValue);
     }
-    
+
     if(rank==8) //Down Right
     {
             checkDownEdge(grid,dof,false,true,expectedValue);
@@ -241,27 +239,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena
         checkRightEdge(grid,dof,false,true,expectedValue);
         checkDownEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==1)//Up Center
     {
         checkDownEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==2)//Up Right
     {
         checkDownEdge(grid,dof,true,false,expectedValue);//prvni je overlap
         checkLeftEdge(grid,dof,false,true,expectedValue);
     }
-    
+
     if(rank==3)//Center Left
     {
         checkUpEdge(grid,dof,false,true,expectedValue);
         checkDownEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==4)//Center Center
     {
         checkUpEdge(grid,dof,true,true,expectedValue);
@@ -269,27 +267,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena
         checkRightEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==5)//Center Right
     {
         checkUpEdge(grid,dof,true,false,expectedValue);
         checkDownEdge(grid,dof,true,false,expectedValue);
         checkLeftEdge(grid,dof,true,true,expectedValue);
     }
-    
+
     if(rank==6)//Down Left
     {
         checkUpEdge(grid,dof,false,true,expectedValue);
         checkRightEdge(grid,dof,true,false,expectedValue);
     }
-    
+
     if(rank==7) //Down Center
     {
         checkUpEdge(grid,dof,true,true,expectedValue);
         checkLeftEdge(grid,dof,true,false,expectedValue);
         checkRightEdge(grid,dof,true,false,expectedValue);
     }
-    
+
     if(rank==8) //Down Right
     {
         checkUpEdge(grid,dof,true,false,expectedValue);
@@ -310,26 +308,25 @@ void check_Inner_2D(int rank, const GridType& grid, const DofType& dof, typename
 }
 
 /*
- * Light check of 2D distributed grid and its synchronization. 
+ * Light check of 2D distributed grid and its synchronization.
  * expected 9 processes
  */
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<2,double,Host,int> GridType;
 typedef MeshFunctionView<GridType> MeshFunctionType;
 typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType;
 typedef Vector<double,Host,int> DofType;
 typedef Vector< bool, Host, int > MaskDofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
 
 class DistributedGridTest_2D : public ::testing::Test
 {
-    
+
    public:
-      
+
       using CoordinatesType = typename GridType::CoordinatesType;
 
       DistributedGridType *distributedGrid;
@@ -347,20 +344,20 @@ class DistributedGridTest_2D : public ::testing::Test
       Pointers::SharedPointer< LinearFunction<double,2>, Host > linearFunctionPtr;
 
       int rank;
-      int nproc;    
+      int nproc;
 
       void SetUp()
       {
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
          GridType globalGrid;
 
          globalOrigin.x()=-0.5;
-         globalOrigin.y()=-0.5;    
+         globalOrigin.y()=-0.5;
          globalProportions.x()=size;
          globalProportions.y()=size;
 
@@ -369,9 +366,9 @@ class DistributedGridTest_2D : public ::testing::Test
 
          distributedGrid=new DistributedGridType();
          distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3 ) );
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
+         distributedGrid->setGlobalGrid( globalGrid );
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
          distributedGrid->setupGrid(*gridPtr);
@@ -422,17 +419,17 @@ TEST_F(DistributedGridTest_2D, evaluateInteriorEntities)
     check_Boundary_2D(rank, *gridPtr, *dof, -1);
     check_Overlap_2D(rank, *gridPtr, *dof, -1);
     check_Inner_2D(rank, *gridPtr, *dof, rank);
-}    
+}
 
 TEST_F(DistributedGridTest_2D, LinearFunctionTest)
 {
-    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) 
+    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid)
     setDof_2D(*dof,-1);
     linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr);
     Synchronizer synchronizer;
     synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
-    
+    synchronizer.synchronize( *meshFunctionPtr );
+
     int count =gridPtr->template getEntitiesCount< Cell >();
     for(int i=0;i<count;i++)
     {
@@ -449,17 +446,17 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    Synchronizer synchronizer;
    synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() );
-   synchronizer.template synchronize<CommunicatorType>( *meshFunctionPtr );
-    
+   synchronizer.synchronize( *meshFunctionPtr );
+
    // checkNeighbor_2D(rank, *gridPtr, *dof);
-   
+
     if(rank==0)//Up Left
     {
         checkRightEdge(*gridPtr, *dof, true,  false, 1 );
         checkDownEdge( *gridPtr, *dof, true,  false, 3 );
         checkCorner(   *gridPtr, *dof, false, false, 4 );
     }
-    
+
     if(rank==1)//Up Center
     {
         checkLeftEdge( *gridPtr, *dof, true,  false, 0 );
@@ -468,14 +465,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge( *gridPtr, *dof, false, false, 4 );
         checkCorner(   *gridPtr, *dof, false, false, 5 );
     }
-    
+
     if(rank==2)//Up Right
     {
         checkLeftEdge( *gridPtr, *dof, true,  false, 1 );
         checkCorner(   *gridPtr, *dof, false, true,  4 );
         checkDownEdge( *gridPtr, *dof, false, true,  5 );
     }
-    
+
     if(rank==3)//Center Left
     {
         checkUpEdge(    *gridPtr, *dof, true,  false, 0 );
@@ -484,7 +481,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge(  *gridPtr, *dof, true,  false, 6 );
         checkCorner(    *gridPtr, *dof, false, false, 7 );
     }
-    
+
     if(rank==4)//Center Center
     {
         checkCorner(    *gridPtr, *dof, true,  true,  0 );
@@ -496,7 +493,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkDownEdge(  *gridPtr, *dof, false, false, 7 );
         checkCorner(    *gridPtr, *dof, false, false, 8 );
     }
-    
+
     if(rank==5)//Center Right
     {
         checkCorner(   *gridPtr, *dof, true,  true,  1 );
@@ -505,14 +502,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkCorner(   *gridPtr, *dof, false, true,  7 );
         checkDownEdge( *gridPtr, *dof, false, true,  8 );
     }
-    
+
     if(rank==6)//Down Left
     {
         checkUpEdge(    *gridPtr, *dof, true,  false, 3 );
         checkCorner(    *gridPtr, *dof, true,  false, 4 );
         checkRightEdge( *gridPtr, *dof, false, true,  7 );
     }
-    
+
     if(rank==7) //Down Center
     {
         checkCorner(    *gridPtr, *dof, true,  true,  3 );
@@ -521,77 +518,77 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest )
         checkLeftEdge(  *gridPtr, *dof, false, true,  6 );
         checkRightEdge( *gridPtr, *dof, false, true,  8 );
     }
-    
+
     if(rank==8) //Down Right
     {
         checkCorner(   *gridPtr, *dof, true,  true, 4 );
         checkUpEdge(   *gridPtr, *dof, false, true, 5 );
         checkLeftEdge( *gridPtr, *dof, false, true, 7 );
-    }   
+    }
 }
 
-// TODO: Fix tests for periodic BC - 
+// TODO: Fix tests for periodic BC -
 // checkLeftBoundary -> checkLeft Overlap etc. for direction BoundaryToOverlap
 // Fix the tests with mask to work with the direction OverlapToBoundary
 /*
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
    dof->setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true );
-   
+   meshFunctionPtr->synchronize( true );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -609,10 +606,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveMask )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -620,13 +617,13 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
+   meshFunctionPtr->synchronize( true, maskPointer );
 
    if( rank == 0 )
    {
@@ -634,39 +631,39 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -684,10 +681,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiveMaskOnLeft )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -695,7 +692,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -711,47 +708,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, 0 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, 3 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  6 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -769,10 +766,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskOnRight )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -780,7 +777,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -796,47 +793,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, 2 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, 5 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -854,10 +851,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskUp )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -865,7 +862,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -881,47 +878,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, 0 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, 1 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, 2 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, -1 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -939,10 +936,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
 TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskDown )
 {
    // Setup periodic boundaries
-   // TODO: I do not know how to do it better with GTEST - additional setup 
+   // TODO: I do not know how to do it better with GTEST - additional setup
    // of the periodic boundaries
    typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-   SubdomainOverlapsGetter< GridType, CommunicatorType >::
+   SubdomainOverlapsGetter< GridType >::
       getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 );
    distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
    distributedGrid->setupGrid(*gridPtr);
@@ -950,7 +947,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() );
    meshFunctionPtr->bind( gridPtr, *dof );
    maskPointer->bind( gridPtr, maskDofs );
-   
+
    //Expecting 9 processes
    setDof_2D(*dof, -rank-1 );
    maskDofs.setValue( true );
@@ -966,47 +963,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
    }
    constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr );
    meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary );
-   meshFunctionPtr->template synchronize<CommunicatorType>( true, maskPointer );
-   
+   meshFunctionPtr->synchronize( true, maskPointer );
+
    if( rank == 0 )
    {
       SCOPED_TRACE( "Up Left" );
       checkLeftBoundary( *gridPtr, *dof, false,  true, -3 );
       checkUpBoundary(   *gridPtr, *dof, false,  true, -7 );
    }
-    
+
    if( rank == 1 )
    {
       SCOPED_TRACE( "Up Center" );
       checkUpBoundary( *gridPtr, *dof, true, true, -8 );
    }
-    
+
    if( rank == 2 )
    {
       SCOPED_TRACE( "Up Right" );
       checkRightBoundary( *gridPtr, *dof, false, true, -1 );
       checkUpBoundary(    *gridPtr, *dof, true, false, -9 );
    }
-    
+
    if( rank == 3 )
    {
       SCOPED_TRACE( "Center Left" );
       checkLeftBoundary( *gridPtr, *dof, true, true, -6 );
-   } 
-        
+   }
+
    if( rank == 5 )
    {
       SCOPED_TRACE( "Center Right" );
       checkRightBoundary( *gridPtr, *dof, true, true, -4 );
    }
-    
+
    if( rank == 6 )
    {
       SCOPED_TRACE( "Down Left" );
       checkDownBoundary( *gridPtr, *dof, false,  true, 6 );
       checkLeftBoundary( *gridPtr, *dof, true,  false,  -9 );
    }
-    
+
    if( rank == 7 )
    {
       SCOPED_TRACE( "Down Center" );
@@ -1020,7 +1017,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv
       checkRightBoundary( *gridPtr, *dof, true, false, -7 );
    }
 }
-*/ 
+*/
 #endif
 
 #endif
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
index 765341c1e5c2361c113ee0c2a83a0e435a3ebf3c..4f552dee5455c576111c60d783aee34d45aeb213 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
@@ -1,9 +1,8 @@
-#ifdef HAVE_GTEST 
+#ifdef HAVE_GTEST
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
+#ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -16,8 +15,7 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
-using namespace TNL::Meshes::DistributedMeshes; 
+using namespace TNL::Meshes::DistributedMeshes;
 
 template<typename DofType>
 void setDof_3D(DofType &dof, typename DofType::RealType value)
@@ -49,14 +47,14 @@ void checkConner(const GridType &grid, const DofType &dof,bool bottom, bool nort
 {
     int i=getAdd(grid,bottom,north,west);
     EXPECT_EQ( dof[i], expectedValue) << "Conner test failed";
-    
+
 }
 
 template<typename DofType,typename GridType>
 void checkXDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool north, typename DofType::RealType expectedValue)
 {
-    int add=getAdd(grid,bottom,north,true);        
-    for(int i=1;i<grid.getDimensions().x()-1;i++) 
+    int add=getAdd(grid,bottom,north,true);
+    for(int i=1;i<grid.getDimensions().x()-1;i++)
             EXPECT_EQ( dof[i+add], expectedValue) << "X direction Edge test failed " << i;
 }
 
@@ -65,7 +63,7 @@ template<typename DofType,typename GridType>
 void checkYDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool west, typename DofType::RealType expectedValue)
 {
     int add=getAdd(grid,bottom,true,west);
-    for(int i=1;i<grid.getDimensions().y()-1;i++) 
+    for(int i=1;i<grid.getDimensions().y()-1;i++)
             EXPECT_EQ( dof[grid.getDimensions().x()*i+add], expectedValue) << "Y direction Edge test failed " << i;
 }
 
@@ -73,7 +71,7 @@ template<typename DofType,typename GridType>
 void checkZDirectionEdge(const GridType &grid, const DofType &dof, bool north, bool west, typename DofType::RealType expectedValue)
 {
     int add=getAdd(grid,true,north,west);
-    for(int i=1;i<grid.getDimensions().z()-1;i++) 
+    for(int i=1;i<grid.getDimensions().z()-1;i++)
             EXPECT_EQ( dof[grid.getDimensions().y()*grid.getDimensions().x()*i+add], expectedValue) << "Z direction Edge test failed " << i;
 }
 
@@ -125,7 +123,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
         checkZFace(grid, dof, true, expectedValue);
-    }    
+    }
 
     if(rank==1)//Bottom North Center
     {
@@ -199,7 +197,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkZDirectionEdge(grid,dof,true,true,expectedValue);
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
-    }    
+    }
 
     if(rank==10)//Center North Center
     {
@@ -257,7 +255,7 @@ void check_Boundary_3D(int rank, const GridType &grid, const DofType &dof, typen
         checkXFace(grid, dof, true, expectedValue);
         checkYFace(grid, dof, true, expectedValue);
         checkZFace(grid, dof, false, expectedValue);
-    }    
+    }
 
     if(rank==19)//Top North Center
     {
@@ -406,8 +404,8 @@ void CheckXFaceNode_Overlap(const GridType &grid, const DofType &dof,bool west,
    checkXFace(grid, dof, !west, expectedValue);
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
-   checkZFace(grid, dof, false, expectedValue);    
-   checkZFace(grid, dof, true, expectedValue);        
+   checkZFace(grid, dof, false, expectedValue);
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -429,7 +427,7 @@ void CheckYFaceNode_Overlap(const GridType &grid, const DofType &dof,bool north,
    checkXFace(grid, dof, true, expectedValue);
    checkYFace(grid, dof, !north, expectedValue);
    checkZFace(grid, dof, false, expectedValue);
-   checkZFace(grid, dof, true, expectedValue);    
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -451,7 +449,7 @@ void CheckZFaceNode_Overlap(const GridType &grid, const DofType &dof,bool bottom
    checkXFace(grid, dof, true, expectedValue);
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
-   checkZFace(grid, dof, !bottom, expectedValue);    
+   checkZFace(grid, dof, !bottom, expectedValue);
 }
 
 template<typename DofType,typename GridType>
@@ -484,11 +482,11 @@ void CheckCentralNode_Overlap(const GridType &grid, const DofType &dof,typename
    checkYFace(grid, dof, false, expectedValue);
    checkYFace(grid, dof, true, expectedValue);
    checkZFace(grid, dof, false, expectedValue);
-   checkZFace(grid, dof, true, expectedValue);    
+   checkZFace(grid, dof, true, expectedValue);
 }
 
 /*
-* Expected 27 processes. 
+* Expected 27 processes.
 */
 template<typename DofType,typename GridType>
 void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue)
@@ -499,7 +497,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena
    if(rank==1)
        CheckXEdgeNode_Overlap(grid,dof,true,true,expectedValue);
 
-   if(rank==2)    
+   if(rank==2)
        CheckConnerNode_Overlap(grid,dof,true,true,false,expectedValue);
 
    if(rank==3)
@@ -553,7 +551,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena
    if(rank==19)
        CheckXEdgeNode_Overlap(grid,dof,false,true,expectedValue);
 
-   if(rank==20)    
+   if(rank==20)
        CheckConnerNode_Overlap(grid,dof,false,true,false,expectedValue);
 
    if(rank==21)
@@ -590,19 +588,18 @@ void check_Inner_3D(int rank, const GridType& grid, const DofType& dof, typename
 
 
 /*
- * Light check of 3D distributed grid and its synchronization. 
+ * Light check of 3D distributed grid and its synchronization.
  * expected 27 processes
  */
-typedef MpiCommunicator CommunicatorType;
 typedef Grid<3,double,Host,int> GridType;
 typedef MeshFunctionView<GridType> MeshFunctionType;
 typedef Vector<double,Host,int> DofType;
 typedef typename GridType::Cell Cell;
-typedef typename GridType::IndexType IndexType; 
-typedef typename GridType::PointType PointType; 
+typedef typename GridType::IndexType IndexType;
+typedef typename GridType::PointType PointType;
 typedef DistributedMesh<GridType> DistributedGridType;
 using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >;
-     
+
 class DistributedGirdTest_3D : public ::testing::Test
 {
    protected:
@@ -620,14 +617,14 @@ class DistributedGirdTest_3D : public ::testing::Test
       Pointers::SharedPointer< LinearFunction<double,3>, Host > linearFunctionPtr;
 
       int rank;
-      int nproc;    
+      int nproc;
 
       void SetUp()
       {
 
          int size=10;
-         rank=CommunicatorType::GetRank(CommunicatorType::AllGroup);
-         nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup);
+         rank=TNL::MPI::GetRank();
+         nproc=TNL::MPI::GetSize();
 
          PointType globalOrigin;
          PointType globalProportions;
@@ -635,7 +632,7 @@ class DistributedGirdTest_3D : public ::testing::Test
 
          globalOrigin.x()=-0.5;
          globalOrigin.y()=-0.5;
-         globalOrigin.z()=-0.5;    
+         globalOrigin.z()=-0.5;
          globalProportions.x()=size;
          globalProportions.y()=size;
          globalProportions.z()=size;
@@ -645,17 +642,17 @@ class DistributedGirdTest_3D : public ::testing::Test
 
          distributedGrid=new DistributedGridType();
          distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3, 3 ) );
-         distributedGrid->template setGlobalGrid<CommunicatorType>( globalGrid );
-         distributedGrid->setupGrid(*gridptr);    
+         distributedGrid->setGlobalGrid( globalGrid );
+         distributedGrid->setupGrid(*gridptr);
          typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-         SubdomainOverlapsGetter< GridType, CommunicatorType >::
+         SubdomainOverlapsGetter< GridType >::
             getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 );
          distributedGrid->setOverlaps( lowerOverlap, upperOverlap );
 
          distributedGrid->setupGrid(*gridptr);
          dof=new DofType(gridptr->template getEntitiesCount< Cell >());
 
-         meshFunctionptr->bind(gridptr,*dof);   
+         meshFunctionptr->bind(gridptr,*dof);
          constFunctionPtr->Number=rank;
       }
 
@@ -697,17 +694,17 @@ TEST_F(DistributedGirdTest_3D, evaluateInteriorEntities)
     check_Boundary_3D(rank, *gridptr, *dof, -1);
     check_Overlap_3D(rank, *gridptr, *dof, -1);
     check_Inner_3D(rank, *gridptr, *dof, rank);
-}   
+}
 
 TEST_F(DistributedGirdTest_3D, LinearFunctionTest)
 {
-    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) 
+    //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid)
     setDof_3D(*dof,-1);
     linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr, linearFunctionPtr);
     Synchronizer synchronizer;
     synchronizer.setDistributedGrid( meshFunctionptr->getMesh().getDistributedMesh() );
-    synchronizer.template synchronize<CommunicatorType>( *meshFunctionptr );
-    
+    synchronizer.synchronize( *meshFunctionptr );
+
     int count =gridptr->template getEntitiesCount< Cell >();
     for(int i=0;i<count;i++)
     {
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index 7decaf5752680df4b59ac86350c152d525000697..a0eddd162f31bc8da213a5fced8778832f75a38a 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -17,8 +17,6 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/distributeSubentities.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/MPIPrint.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Writers/PVTUWriter.h>
 #include <TNL/Meshes/Readers/PVTUReader.h>
@@ -33,9 +31,6 @@ using namespace TNL::Meshes::DistributedMeshes;
 
 // cannot be deduced from the grid
 using LocalIndexType = short int;
-// we test only with MPI
-using CommunicatorType = Communicators::MpiCommunicator;
-using CommunicationGroup = typename CommunicatorType::CommunicationGroup;
 
 template< typename Mesh >
 struct GridDistributor;
@@ -55,9 +50,9 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > >
 
    GridDistributor() = delete;
 
-   GridDistributor( CoordinatesType rank_sizes, CommunicationGroup group )
-      : rank(CommunicatorType::GetRank(group)),
-        nproc(CommunicatorType::GetSize(group)),
+   GridDistributor( CoordinatesType rank_sizes, MPI_Comm group )
+      : rank(TNL::MPI::GetRank(group)),
+        nproc(TNL::MPI::GetSize(group)),
         rank_sizes(rank_sizes),
         group(group)
    {}
@@ -329,7 +324,7 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > >
    // input parameters
    int rank, nproc;
    CoordinatesType rank_sizes;
-   CommunicationGroup group;
+   MPI_Comm group;
    // output attributes (byproduct of the decomposition, useful for testing)
    CoordinatesType rank_coordinates, local_size, vert_begin, vert_end, cell_begin, cell_end;
    Index verticesCount, cellsCount, localVerticesCount, localCellsCount;
@@ -342,7 +337,7 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
    using Device = typename Mesh::DeviceType;
 
    // check basic interface
-   EXPECT_EQ( mesh.getCommunicationGroup(), CommunicatorType::AllGroup );
+   EXPECT_EQ( mesh.getCommunicationGroup(), TNL::MPI::AllGroup() );
    EXPECT_EQ( mesh.getGhostLevels(), ghostLevels );
    if( ghostLevels > 0 ) {
       EXPECT_EQ( mesh.template getGlobalIndices< 0 >().getSize(), mesh.getLocalMesh().template getEntitiesCount< 0 >() );
@@ -399,12 +394,12 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
          Containers::Array< Index, Device > vert_sendbuf( distributor.nproc ), cell_sendbuf( distributor.nproc );
          vert_sendbuf.setValue( distributor.localVerticesCount );
          cell_sendbuf.setValue( distributor.localCellsCount );
-         CommunicatorType::Alltoall( vert_sendbuf.getData(), 1,
-                                     vert_offsets.getData(), 1,
-                                     distributor.group );
-         CommunicatorType::Alltoall( cell_sendbuf.getData(), 1,
-                                     cell_offsets.getData(), 1,
-                                     distributor.group );
+         TNL::MPI::Alltoall( vert_sendbuf.getData(), 1,
+                             vert_offsets.getData(), 1,
+                             distributor.group );
+         TNL::MPI::Alltoall( cell_sendbuf.getData(), 1,
+                             cell_offsets.getData(), 1,
+                             distributor.group );
       }
       vert_offsets.setElement( distributor.nproc, 0 );
       cell_offsets.setElement( distributor.nproc, 0 );
@@ -662,7 +657,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh )
          if( received != center ) {
             IndexType cellIndexes[ 2 ] = {0, 0};
             const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes );
-            std::cerr << "rank " << CommunicatorType::GetRank()
+            std::cerr << "rank " << TNL::MPI::GetRank()
                       << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")"
                       << " of dimension = " << EntityType::getEntityDimension()
                       << ": received " << received << ", expected = " << center
@@ -672,7 +667,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh )
          }
       }
    if( errors > 0 )
-      FAIL() << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl;
+      FAIL() << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." << std::endl;
 }
 
 template< typename Device, typename EntityType, typename MeshType >
@@ -704,10 +699,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel0 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 0;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -721,10 +716,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel1 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 1;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -739,10 +734,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel2 )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 2;
    distributor.decompose( grid, mesh, ghostLevels );
    validateMesh( mesh, distributor, ghostLevels );
@@ -757,10 +752,10 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    using Mesh = DistributedMesh< LocalMesh >;
    GridType grid;
    grid.setDomain( {0, 0}, {1, 1} );
-   const int nproc = CommunicatorType::GetSize();
+   const int nproc = TNL::MPI::GetSize();
    grid.setDimensions( nproc, nproc );
    Mesh mesh;
-   GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup );
+   GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() );
    const int ghostLevels = 2;
    distributor.decompose( grid, mesh, ghostLevels );
 
@@ -770,7 +765,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    std::string subfilePath;
    {
       std::ofstream file;
-      if( CommunicatorType::GetRank() == 0 )
+      if( TNL::MPI::GetRank() == 0 )
          file.open( mainFilePath );
       using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >;
       PVTU pvtu( file );
@@ -781,7 +776,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
          pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
          pvtu.template writePCellData< typename Mesh::GlobalIndexType >( "GlobalIndex" );
       }
-      subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() );
+      subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() );
 
       // create a .vtu file for local data
       using Writer = Meshes::Writers::VTUWriter< LocalMesh >;
@@ -799,7 +794,7 @@ TEST( DistributedMeshTest, PVTUWriterReader )
    }
 
    // load and test
-   CommunicatorType::Barrier();
+   TNL::MPI::Barrier();
    Readers::PVTUReader reader( mainFilePath );
    reader.detectMesh();
    EXPECT_EQ( reader.getMeshType(), "Meshes::DistributedMesh" );
@@ -813,8 +808,8 @@ TEST( DistributedMeshTest, PVTUWriterReader )
 
    // cleanup
    EXPECT_EQ( fs::remove( subfilePath ), true );
-   CommunicatorType::Barrier();
-   if( CommunicatorType::GetRank() == 0 ) {
+   TNL::MPI::Barrier();
+   if( TNL::MPI::GetRank() == 0 ) {
       EXPECT_EQ( fs::remove( mainFilePath ), true );
       EXPECT_EQ( fs::remove( baseName ), true );
    }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
index 0a5ab3e37da3c3f0360fad0525425df95d2d6c9e..9bdccbcdb7006c8f45f00865b2c3e6e60456095f 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
@@ -2,13 +2,8 @@
       #include <gtest/gtest.h>
 #ifdef HAVE_MPI
 
-#include <TNL/Communicators/MpiCommunicator.h>
 #include "DistributedVectorFieldIO_MPIIOTestBase.h"
 
-using namespace TNL::Communicators;
-
-typedef MpiCommunicator CommunicatorType;
-
 TEST( DistributedVectorFieldIO_MPIIO, Save_1D )
 {
     TestDistributedVectorFieldMPIIO<1,2,Host>::TestSave();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
index f35ec8e089621027f065ef764631494b87500982..d6791e1df9d27d9d89ef206c0d3be45288d80c3f 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h
@@ -1,4 +1,3 @@
-#include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunctionView.h>
 #include <TNL/Functions/VectorField.h>
@@ -17,13 +16,10 @@ using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
 using namespace TNL::Devices;
-using namespace TNL::Communicators;
 using namespace TNL::Meshes::DistributedMeshes;
 
 //------------------------------------------------------------------------------
 
-typedef MpiCommunicator CommunicatorType;
-
 template <int dim, int vctdim, typename Device>
 class TestDistributedVectorFieldMPIIO{
     public:
@@ -33,8 +29,8 @@ class TestDistributedVectorFieldMPIIO{
 	typedef VectorField<vctdim,MeshFunctionType> VectorFieldType;
     typedef Vector<double,Device,int> DofType;
     typedef typename MeshType::Cell Cell;
-    typedef typename MeshType::IndexType IndexType; 
-    typedef typename MeshType::PointType PointType; 
+    typedef typename MeshType::IndexType IndexType;
+    typedef typename MeshType::PointType PointType;
     typedef DistributedMesh<MeshType> DistributedGridType;
 
     typedef typename DistributedGridType::CoordinatesType CoordinatesType;
@@ -43,8 +39,8 @@ class TestDistributedVectorFieldMPIIO{
     static void TestSave()
     {
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
-        
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;
+
         //save distributed meshfunction into file
         PointType globalOrigin;
         globalOrigin.setValue(-0.5);
@@ -55,14 +51,14 @@ class TestDistributedVectorFieldMPIIO{
         Pointers::SharedPointer<MeshType> globalGrid;
         globalGrid->setDimensions(globalProportions);
         globalGrid->setDomain(globalOrigin,globalProportions);
-        
+
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>( *globalGrid );
+        distributedGrid.setGlobalGrid( *globalGrid );
 
-        Pointers::SharedPointer<MeshType> gridptr;        
+        Pointers::SharedPointer<MeshType> gridptr;
         distributedGrid.setupGrid(*gridptr);
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
         distributedGrid.setupGrid(*gridptr);
 
@@ -74,10 +70,10 @@ class TestDistributedVectorFieldMPIIO{
         DofType dof(vctdim*(gridptr->template getEntitiesCount< Cell >()));
         dof.setValue(0);
         vectorField.bind(gridptr,dof);
-            
+
 		for(int i=0;i<vctdim;i++)
 	        linearFunctionEvaluator.evaluateAllEntities(vectorField [ i ], linearFunctionPtr);
- 
+
         String FileName=String("/tmp/test-file.tnl");
         DistributedGridIO_VectorField<VectorFieldType,MpiIO> ::save(FileName, vectorField );
         /*File file;
@@ -86,7 +82,7 @@ class TestDistributedVectorFieldMPIIO{
 		file.close();		*/
 
        //first process compare results
-       if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+       if(TNL::MPI::GetRank()==0)
        {
             DofType globalEvaluatedDof(vctdim*(globalGrid->template getEntitiesCount< Cell >()));
 
@@ -101,7 +97,7 @@ class TestDistributedVectorFieldMPIIO{
             loadvct.bind(globalGrid,loadDof);
 
             loadDof.setValue(-1);
-        
+
             File file;
             file.open( FileName, std::ios_base::in );
 	    loadvct.boundLoad(file);
@@ -111,13 +107,13 @@ class TestDistributedVectorFieldMPIIO{
 	    }
        }
     };
-    
+
     static void TestLoad()
     {
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;
 
-        //Crete distributed grid            
+        //Create distributed grid
         PointType globalOrigin;
         globalOrigin.setValue(-0.5);
 
@@ -131,26 +127,26 @@ class TestDistributedVectorFieldMPIIO{
         CoordinatesType overlap;
         overlap.setValue(1);
         DistributedGridType distributedGrid;
-        distributedGrid.template setGlobalGrid<CommunicatorType>(*globalGrid);
+        distributedGrid.setGlobalGrid(*globalGrid);
         typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap;
-        SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
+        SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
 
-        String FileName=String("/tmp/test-file.tnl");         
+        String FileName=String("/tmp/test-file.tnl");
 
-        //Prepare file   
-        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
-        {   
+        //Prepare file
+        if(TNL::MPI::GetRank()==0)
+        {
             DofType saveDof(vctdim*(globalGrid->template getEntitiesCount< Cell >()));
 
             VectorFieldType saveVectorField;
             saveVectorField.bind(globalGrid,saveDof);
             for(int i=0;i<vctdim;i++)
                 linearFunctionEvaluator.evaluateAllEntities(saveVectorField[i] , linearFunctionPtr);
-      
+
             File file;
-            file.open( FileName, std::ios_base::out );        
+            file.open( FileName, std::ios_base::out );
             saveVectorField.save(file);
             file.close();
         }
@@ -158,7 +154,7 @@ class TestDistributedVectorFieldMPIIO{
         Pointers::SharedPointer<MeshType> loadGridptr;
         VectorFieldType loadVectorField;
         distributedGrid.setupGrid(*loadGridptr);
-        
+
         DofType loadDof(vctdim*(loadGridptr->template getEntitiesCount< Cell >()));
         loadDof.setValue(0);
         loadVectorField.bind(loadGridptr,loadDof);
@@ -169,26 +165,26 @@ class TestDistributedVectorFieldMPIIO{
         synchronizer.setDistributedGrid( &distributedGrid );
 
         for(int i=0;i<vctdim;i++)
-            synchronizer.template synchronize<CommunicatorType>(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof
+            synchronizer.synchronize(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof
 
         Pointers::SharedPointer<MeshType> evalGridPtr;
         VectorFieldType evalVectorField;
         distributedGrid.setupGrid(*evalGridPtr);
-        
+
         DofType evalDof(vctdim*(evalGridPtr->template getEntitiesCount< Cell >()));
         evalDof.setValue(-1);
         evalVectorField.bind(evalGridPtr,evalDof);
-        
+
         for(int i=0;i<vctdim;i++)
         {
-            linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr);        
-            synchronizer.template synchronize<CommunicatorType>(*evalVectorField[i]);
+            linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr);
+            synchronizer.synchronize(*evalVectorField[i]);
         }
 
         for(int i=0;i<evalDof.getSize();i++)
         {
             EXPECT_EQ( evalDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
-        
+
     }
 };
diff --git a/src/UnitTests/main_mpi.h b/src/UnitTests/main_mpi.h
index 0f8f4b059a119fb42d5bbb78be01540a50cbdf9b..d22f6d3ebb0b9c9446a0ebedb99eaf5d9c34b24f 100644
--- a/src/UnitTests/main_mpi.h
+++ b/src/UnitTests/main_mpi.h
@@ -6,9 +6,8 @@
 #endif
 
 #if (defined(HAVE_GTEST) && defined(HAVE_MPI))
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Communicators/ScopedInitializer.h>
-using CommunicatorType = TNL::Communicators::MpiCommunicator;
+#include <TNL/MPI/ScopedInitializer.h>
+#include <TNL/MPI/Wrappers.h>
 
 #include <sstream>
 
@@ -37,7 +36,7 @@ public:
    // Called after a test ends.
    virtual void OnTestEnd(const ::testing::TestInfo& test_info)
    {
-      const int rank = CommunicatorType::GetRank(CommunicatorType::AllGroup);
+      const int rank = TNL::MPI::GetRank();
       sout << test_info.test_case_name() << "." << test_info.name() << " End." <<std::endl;
       std::cout << rank << ":" << std::endl << sout.str()<< std::endl;
       sout.str( std::string() );
@@ -58,7 +57,7 @@ int main( int argc, char* argv[] )
       delete listeners.Release(listeners.default_result_printer());
       listeners.Append(new MinimalistBufferedPrinter);
 
-      TNL::Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
+      TNL::MPI::ScopedInitializer mpi(argc, argv);
    #endif
    return RUN_ALL_TESTS();
 #else