diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b4a98a9e1285fdbde4b2cefff816fca819bf9547..157374d4c7c1224768332810c40103f6aea4646c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -18,14 +18,24 @@ stages:
     WITH_CUDA: "no"
     WITH_CUDA_ARCH: "auto"
     WITH_MIC: "no"
+    WITH_MPI: "no"
     WITH_TESTS: "yes"
     WITH_COVERAGE: "no"
-    WITH_EXAMPLES: "yes"
+    # these are built only in the "full" config
+    WITH_BENCHMARKS: "no"
+    WITH_EXAMPLES: "no"
+    WITH_TOOLS: "no"
+    WITH_PYTHON: "no"
 
 # template for build jobs
 .build_template_def: &build_template
     stage: build
     script:
+        # set MPI compiler wrapper
+        - if [[ ${WITH_MPI} == "yes" ]]; then
+                export CXX=mpicxx;
+                export CC=mpicc;
+          fi
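+        # NOTE: the wrappers only forward to the underlying compiler; e.g.
+        # `mpicxx -show` (MPICH, Intel MPI) or `mpicxx --showme` (Open MPI)
+        # prints the real compiler command line they expand to.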
         # all cores including hyperthreading
 #        - export NUM_CORES=$(grep "core id" /proc/cpuinfo | wc -l)
#       # all physical cores
@@ -46,7 +56,10 @@ stages:
                 -DWITH_MIC=${WITH_MIC}
                 -DWITH_TESTS=${WITH_TESTS}
                 -DWITH_COVERAGE=${WITH_COVERAGE}
+                -DWITH_BENCHMARKS=${WITH_BENCHMARKS}
                 -DWITH_EXAMPLES=${WITH_EXAMPLES}
+                -DWITH_TOOLS=${WITH_TOOLS}
+                -DWITH_PYTHON=${WITH_PYTHON}
 #        - make
 #        - make test
 #        - make install
@@ -61,7 +74,7 @@ stages:
# CUDA builds are specified first because they take more time than host-only builds,
# which can be scheduled on hosts without GPUs.
 
-cuda_Debug:
+cuda_base_Debug:
     <<: *build_template
     tags:
         - gpu
@@ -70,7 +83,7 @@ cuda_Debug:
         WITH_CUDA: "yes"
         BUILD_TYPE: Debug
 
-cuda_Release:
+cuda_base_Release:
     <<: *build_template
     tags:
         - gpu
@@ -79,38 +92,94 @@ cuda_Release:
         WITH_CUDA: "yes"
         BUILD_TYPE: Release
 
-cuda+openmp_Debug:
+cuda_mpi_Debug:
     <<: *build_template
     tags:
         - openmp
         - gpu
+        - mpi
     variables:
         <<: *default_cmake_flags
         WITH_OPENMP: "yes"
         WITH_CUDA: "yes"
+        WITH_MPI: "yes"
         BUILD_TYPE: Debug
 
-cuda+openmp_Release:
+cuda_mpi_Release:
     <<: *build_template
     tags:
         - openmp
         - gpu
+        - mpi
     variables:
         <<: *default_cmake_flags
         WITH_OPENMP: "yes"
         WITH_CUDA: "yes"
+        WITH_MPI: "yes"
         BUILD_TYPE: Release
 
-default_Debug:
+cuda_full_Debug:
     <<: *build_template
+    tags:
+        - openmp
+        - gpu
+    variables:
+        <<: *default_cmake_flags
+        WITH_OPENMP: "yes"
+        WITH_CUDA: "yes"
+        BUILD_TYPE: Debug
+        WITH_BENCHMARKS: "yes"
+        WITH_EXAMPLES: "yes"
+        WITH_TOOLS: "yes"
+        WITH_PYTHON: "yes"
+
+cuda_full_Release:
+    <<: *build_template
+    tags:
+        - openmp
+        - gpu
+    variables:
+        <<: *default_cmake_flags
+        WITH_OPENMP: "yes"
+        WITH_CUDA: "yes"
+        BUILD_TYPE: Release
+        WITH_BENCHMARKS: "yes"
+        WITH_EXAMPLES: "yes"
+        WITH_TOOLS: "yes"
+        WITH_PYTHON: "yes"
 
-default_Release:
+default_base_Debug:
     <<: *build_template
+
+default_base_Release:
+    <<: *build_template
+    variables:
+        <<: *default_cmake_flags
+        BUILD_TYPE: Release
+
+default_mpi_Debug:
+    <<: *build_template
+    tags:
+        - openmp
+        - mpi
     variables:
         <<: *default_cmake_flags
+        WITH_OPENMP: "yes"
+        WITH_MPI: "yes"
+        BUILD_TYPE: Debug
+
+default_mpi_Release:
+    <<: *build_template
+    tags:
+        - openmp
+        - mpi
+    variables:
+        <<: *default_cmake_flags
+        WITH_OPENMP: "yes"
+        WITH_MPI: "yes"
         BUILD_TYPE: Release
 
-openmp_Debug:
+default_full_Debug:
     <<: *build_template
     tags:
         - openmp
@@ -118,8 +187,12 @@ openmp_Debug:
         <<: *default_cmake_flags
         WITH_OPENMP: "yes"
         BUILD_TYPE: Debug
+        WITH_BENCHMARKS: "yes"
+        WITH_EXAMPLES: "yes"
+        WITH_TOOLS: "yes"
+        WITH_PYTHON: "yes"
 
-openmp_Release:
+default_full_Release:
     <<: *build_template
     tags:
         - openmp
@@ -127,3 +200,7 @@ openmp_Release:
         <<: *default_cmake_flags
         WITH_OPENMP: "yes"
         BUILD_TYPE: Release
+        WITH_BENCHMARKS: "yes"
+        WITH_EXAMPLES: "yes"
+        WITH_TOOLS: "yes"
+        WITH_PYTHON: "yes"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a8b1543fdc04a27aec514676a66beb81f9ebc12..8ed064eb1de0ebce012daf328173b420ada0f0a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ option(WITH_TESTS "Build tests" ON)
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
 option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
+option(WITH_BENCHMARKS "Compile the 'src/Benchmarks' directory" ON)
 option(WITH_PYTHON "Compile the Python bindings" ON)
 option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF)
 
@@ -62,6 +63,20 @@ else()
     set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/bin )
 endif()
 
+# check if the compiler is good enough
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+   # GCC 5.0 is the first release with full C++11 support (due to libstdc++)
+   # https://gcc.gnu.org/gcc-5/changes.html
+   if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0")
+      message(FATAL_ERROR "Insufficient GCC version")
+   endif()
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+   # Clang 3.4 has full C++14 support: http://clang.llvm.org/cxx_status.html
+   if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "3.4")
+      message(FATAL_ERROR "Insufficient Clang version")
+   endif()
+endif()
+
 # set Debug/Release options
 set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
@@ -109,10 +124,11 @@ endif()
 #####
 # Check for MPI -- detected from the compiler wrapper -- building without MPI can still be tested
 #
-if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" )
+if( ${CXX_COMPILER_NAME} STREQUAL "mpicxx" )
    message( "MPI compiler detected."    )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-   set( CUDA_HOST_COMPILER "mpic++" )
+   set( CUDA_HOST_COMPILER "mpicxx" )
+   set( BUILD_MPI ON )
 endif()
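+# NOTE: with the detection above, MPI support is enabled simply by configuring
+# with the wrapper as the C++ compiler, e.g. (as the ./build script does):
+#    CXX=mpicxx cmake <source-dir>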
 
 ####
@@ -150,7 +166,7 @@ if( ${WITH_CUDA} )
             endif()
         endif()
         # An extra CUDA_ARCH_HOST_COMPILER variable for compiling tnl-cuda-arch alone,
-        # because it SHOULD NOT be compiled using mpic++, which would cause weird
+        # because it SHOULD NOT be compiled using mpicxx, which would cause a weird
         # RPATH_CHANGE error in cmake.
         # FIXME: find better solution to switch between MPI-enabled and MPI-disabled binaries in cmake
         if( NOT $ENV{CUDA_ARCH_HOST_COMPILER} STREQUAL "" )
@@ -406,15 +422,9 @@ INCLUDE_DIRECTORIES( ${PROJECT_BUILD_PATH} )
 LINK_DIRECTORIES( ${LIBRARY_OUTPUT_PATH} )
 
 # Add all subdirectories
-# Note that it is important to start building examples as soon as possible,
-# because they take the longest time and other stuff can be pipelined before
-# they are finished (at least with Ninja).
-if( WITH_EXAMPLES STREQUAL "yes" )
-   add_subdirectory( examples )
-endif( WITH_EXAMPLES STREQUAL "yes" )
-if( WITH_TESTS STREQUAL "yes" )
+if( ${WITH_TESTS} )
     add_subdirectory( tests )
-endif( WITH_TESTS STREQUAL "yes" )
+endif()
 add_subdirectory( src )
 add_subdirectory( share )
 
@@ -458,6 +468,7 @@ message( "   WITH_TESTS=${WITH_TESTS}" )
 message( "   WITH_COVERAGE=${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES=${WITH_EXAMPLES}" )
 message( "   WITH_TOOLS=${WITH_TOOLS}" )
+message( "   WITH_BENCHMARKS=${WITH_BENCHMARKS}" )
 message( "   WITH_PYTHON=${WITH_PYTHON}" )
 message( "   WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" )
 # Print compiler options
diff --git a/build b/build
index 69f01cb406b6d912f0241738db56d1370f1e6105..e0c8dbb993e592c420aa62abd991c189fbff4870 100755
--- a/build
+++ b/build
@@ -26,6 +26,7 @@ WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
 WITH_PYTHON="yes"
 WITH_TOOLS="yes"
+WITH_BENCHMARKS="yes"
 
 WITH_TEMPLATE_INSTANTIATION="no"
 INSTANTIATE_LONG_INT="no"
@@ -60,6 +61,7 @@ do
         --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
         --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
         --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
+        --with-benchmarks=*              ) WITH_BENCHMARKS="${option#*=}" ;;
         --with-python=*                  ) WITH_PYTHON="${option#*=}" ;;
         --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
         --instantiate-long-int=*         ) INSTANTIATE_LONG_INT="${option#*=}" ;;
@@ -114,17 +116,18 @@ if [[ ${WITH_CLANG} == "yes" ]]; then
 fi
 
 if [[ ${WITH_MPI} == "yes" ]]; then
-    if [[ ! -x  "$(command -v mpic++)" ]]; then
-       echo "Warning:mpic++ is not installed on this system. MPI support is turned off." 
+    # NOTE: OpenMPI provides mpic++, but Intel MPI does not
+    if [[ ! -x  "$(command -v mpicxx)" ]]; then
+       echo "Warning: mpicxx is not installed on this system. MPI support is turned off."
     else
        # instruct OpenMPI to use the original compiler
        # reference: https://www.open-mpi.org/faq/?category=mpi-apps#override-wrappers-after-v1.0
-       # FIXME: this does not work with CUDA_HOST_COMPILER=mpic++
+       # FIXME: this does not work with CUDA_HOST_COMPILER=mpicxx
 #       if [ -n "$CXX" ]; then
 #          export OMPI_CXX="$CXX"
 #       fi
-       export CXX=mpic++
-       export CUDA_HOST_COMPILER=mpic++
+       export CXX=mpicxx
+       export CUDA_HOST_COMPILER=mpicxx
     fi
     if [[ ! -x  "$(command -v mpicc)" ]]; then
        echo "Warning: mpicc is not installed on this system." 
@@ -163,6 +166,7 @@ cmake_command=(
          -DWITH_COVERAGE=${WITH_COVERAGE}
          -DWITH_EXAMPLES=${WITH_EXAMPLES}
          -DWITH_TOOLS=${WITH_TOOLS}
+         -DWITH_BENCHMARKS=${WITH_BENCHMARKS}
          -DWITH_PYTHON=${WITH_PYTHON}
          -DDCMTK_DIR=${DCMTK_DIR}
          -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION}
diff --git a/share/CMakeLists.txt b/share/CMakeLists.txt
index 3559626d0dd91df5cdf0affcc0cea967bd5a2e21..2a6f286925ef2a17b6cd7adc902fdefbc930fb39 100644
--- a/share/CMakeLists.txt
+++ b/share/CMakeLists.txt
@@ -1 +1,2 @@
-add_subdirectory( Tools )
+add_subdirectory (cmake)
+add_subdirectory (pkgconfig)
diff --git a/share/Tools/CMakeLists.txt b/share/Tools/CMakeLists.txt
deleted file mode 100644
index 2a6f286925ef2a17b6cd7adc902fdefbc930fb39..0000000000000000000000000000000000000000
--- a/share/Tools/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_subdirectory (cmake)
-add_subdirectory (pkgconfig)
diff --git a/share/Tools/cmake/CMakeFindTNL.cmake b/share/cmake/CMakeFindTNL.cmake
similarity index 100%
rename from share/Tools/cmake/CMakeFindTNL.cmake
rename to share/cmake/CMakeFindTNL.cmake
diff --git a/share/Tools/cmake/CMakeLists.txt b/share/cmake/CMakeLists.txt
similarity index 100%
rename from share/Tools/cmake/CMakeLists.txt
rename to share/cmake/CMakeLists.txt
diff --git a/share/Tools/pkgconfig/CMakeLists.txt b/share/pkgconfig/CMakeLists.txt
similarity index 100%
rename from share/Tools/pkgconfig/CMakeLists.txt
rename to share/pkgconfig/CMakeLists.txt
diff --git a/share/Tools/pkgconfig/cuda.pc.in b/share/pkgconfig/cuda.pc.in
similarity index 100%
rename from share/Tools/pkgconfig/cuda.pc.in
rename to share/pkgconfig/cuda.pc.in
diff --git a/share/Tools/pkgconfig/tnl-cuda.pc.in b/share/pkgconfig/tnl-cuda.pc.in
similarity index 100%
rename from share/Tools/pkgconfig/tnl-cuda.pc.in
rename to share/pkgconfig/tnl-cuda.pc.in
diff --git a/share/Tools/pkgconfig/tnl-openmp.pc.in b/share/pkgconfig/tnl-openmp.pc.in
similarity index 100%
rename from share/Tools/pkgconfig/tnl-openmp.pc.in
rename to share/pkgconfig/tnl-openmp.pc.in
diff --git a/share/Tools/pkgconfig/tnl.pc.in b/share/pkgconfig/tnl.pc.in
similarity index 100%
rename from share/Tools/pkgconfig/tnl.pc.in
rename to share/pkgconfig/tnl.pc.in
diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dc396c46e60512b5cab5f5f7455756f609b76c47
--- /dev/null
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -0,0 +1,10 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu )
+    CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
+endif()
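+
+# NOTE: CUDA_ADD_EXECUTABLE and CUDA_ADD_CUBLAS_TO_TARGET are macros from
+# CMake's FindCUDA module, presumably loaded by the top-level CMakeLists.txt
+# when WITH_CUDA is enabled; the else branch builds a host-only executable
+# from the plain .cpp source.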
+
+install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..aacdb9cc65315af377ddec49c167a9a197483bd5
--- /dev/null
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -0,0 +1,165 @@
+/***************************************************************************
+                          array-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkArrayOperations( Benchmark & benchmark,
+                          const int & loops,
+                          const long & size )
+{
+   typedef Containers::Array< Real, Devices::Host, Index > HostArray;
+   typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
+   using namespace std;
+
+   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
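+   // e.g. loops = 10, size = 10^7 and Real = double give
+   // 10 * 1e7 * 8 B = ~0.75 GB of traffic per benchmarked operation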
+
+   HostArray hostArray, hostArray2;
+   CudaArray deviceArray, deviceArray2;
+   hostArray.setSize( size );
+   hostArray2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceArray.setSize( size );
+   deviceArray2.setSize( size );
+#endif
+
+   Real resultHost, resultDevice;
+
+
+   // reset functions
+   auto reset1 = [&]() {
+      hostArray.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceArray.setValue( 1.0 );
+#endif
+   };
+   auto reset2 = [&]() {
+      hostArray2.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceArray2.setValue( 1.0 );
+#endif
+   };
+   auto reset12 = [&]() {
+      reset1();
+      reset2();
+   };
+
+
+   reset12();
+
+
+   auto compareHost = [&]() {
+      resultHost = (int) ( hostArray == hostArray2 );
+   };
+   auto compareCuda = [&]() {
+      resultDevice = (int) ( deviceArray == deviceArray2 );
+   };
+   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", compareHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", compareCuda );
+#endif
+
+
+   auto copyAssignHostHost = [&]() {
+      hostArray = hostArray2;
+   };
+   auto copyAssignCudaCuda = [&]() {
+      deviceArray = deviceArray2;
+   };
+   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
+   // copyBasetime is used later only inside the HAVE_CUDA guard, so the
+   // compiler warns about an unused variable when compiling without CUDA
+   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+#endif
+
+
+   auto copyAssignHostCuda = [&]() {
+      deviceArray = hostArray;
+   };
+   auto copyAssignCudaHost = [&]() {
+      hostArray = deviceArray;
+   };
+#ifdef HAVE_CUDA
+   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
+   benchmark.time( reset1,
+                   "CPU->GPU", copyAssignHostCuda,
+                   "GPU->CPU", copyAssignCudaHost );
+#endif
+
+
+   auto setValueHost = [&]() {
+      hostArray.setValue( 3.0 );
+   };
+   auto setValueCuda = [&]() {
+      deviceArray.setValue( 3.0 );
+   };
+   benchmark.setOperation( "setValue", datasetSize );
+   benchmark.time( reset1, "CPU", setValueHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", setValueCuda );
+#endif
+
+
+   auto setSizeHost = [&]() {
+      hostArray.setSize( size );
+   };
+   auto setSizeCuda = [&]() {
+      deviceArray.setSize( size );
+   };
+   auto resetSize1 = [&]() {
+      hostArray.reset();
+#ifdef HAVE_CUDA
+      deviceArray.reset();
+#endif
+   };
+   benchmark.setOperation( "allocation (setSize)", datasetSize );
+   benchmark.time( resetSize1, "CPU", setSizeHost );
+#ifdef HAVE_CUDA
+   benchmark.time( resetSize1, "GPU", setSizeCuda );
+#endif
+
+
+   auto resetSizeHost = [&]() {
+      hostArray.reset();
+   };
+   auto resetSizeCuda = [&]() {
+      deviceArray.reset();
+   };
+   auto setSize1 = [&]() {
+      hostArray.setSize( size );
+#ifdef HAVE_CUDA
+      deviceArray.setSize( size );
+#endif
+   };
+   benchmark.setOperation( "deallocation (reset)", datasetSize );
+   benchmark.time( setSize1, "CPU", resetSizeHost );
+#ifdef HAVE_CUDA
+   benchmark.time( setSize1, "GPU", resetSizeCuda );
+#endif
+
+   return true;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/tests/benchmarks/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
similarity index 74%
rename from tests/benchmarks/cublasWrappers.h
rename to src/Benchmarks/BLAS/cublasWrappers.h
index 6b71a4ed7befc12664440cf5425f5975e6feec0c..1e63e139d6faa513706ed1b18a207e87ea1a079d 100644
--- a/tests/benchmarks/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -8,14 +8,14 @@ inline cublasStatus_t
 cublasIgamax( cublasHandle_t handle, int n,
               const float           *x, int incx, int *result )
 {
-    return cublasIsamax( handle, n, x, incx, result );
+   return cublasIsamax( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasIgamax( cublasHandle_t handle, int n,
               const double          *x, int incx, int *result )
 {
-    return cublasIdamax( handle, n, x, incx, result );
+   return cublasIdamax( handle, n, x, incx, result );
 }
 
 
@@ -23,14 +23,14 @@ inline cublasStatus_t
 cublasIgamin( cublasHandle_t handle, int n,
               const float           *x, int incx, int *result )
 {
-    return cublasIsamin( handle, n, x, incx, result );
+   return cublasIsamin( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasIgamin( cublasHandle_t handle, int n,
               const double          *x, int incx, int *result )
 {
-    return cublasIdamin( handle, n, x, incx, result );
+   return cublasIdamin( handle, n, x, incx, result );
 }
 
 
@@ -38,14 +38,14 @@ inline cublasStatus_t
 cublasGasum( cublasHandle_t handle, int n,
              const float           *x, int incx, float  *result )
 {
-    return cublasSasum( handle, n, x, incx, result );
+   return cublasSasum( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasGasum( cublasHandle_t handle, int n,
              const double          *x, int incx, double *result )
 {
-    return cublasDasum( handle, n, x, incx, result );
+   return cublasDasum( handle, n, x, incx, result );
 }
 
 
@@ -55,7 +55,7 @@ cublasGaxpy( cublasHandle_t handle, int n,
              const float           *x, int incx,
              float                 *y, int incy )
 {
-    return cublasSaxpy( handle, n, alpha, x, incx, y, incy );
+   return cublasSaxpy( handle, n, alpha, x, incx, y, incy );
 }
 
 inline cublasStatus_t
@@ -64,7 +64,7 @@ cublasGaxpy( cublasHandle_t handle, int n,
              const double          *x, int incx,
              double                *y, int incy )
 {
-    return cublasDaxpy( handle, n, alpha, x, incx, y, incy );
+   return cublasDaxpy( handle, n, alpha, x, incx, y, incy );
 }
 
 
@@ -74,7 +74,7 @@ cublasGdot( cublasHandle_t handle, int n,
             const float        *y, int incy,
             float         *result )
 {
-    return cublasSdot( handle, n, x, incx, y, incy, result );
+   return cublasSdot( handle, n, x, incx, y, incy, result );
 }
 
 inline cublasStatus_t
@@ -83,7 +83,7 @@ cublasGdot( cublasHandle_t handle, int n,
             const double       *y, int incy,
             double        *result )
 {
-    return cublasDdot( handle, n, x, incx, y, incy, result );
+   return cublasDdot( handle, n, x, incx, y, incy, result );
 }
 
 
@@ -91,14 +91,14 @@ inline cublasStatus_t
 cublasGnrm2( cublasHandle_t handle, int n,
              const float           *x, int incx, float  *result )
 {
-    return cublasSnrm2( handle, n, x, incx, result );
+   return cublasSnrm2( handle, n, x, incx, result );
 }
 
 inline cublasStatus_t
 cublasGnrm2( cublasHandle_t handle, int n,
              const double          *x, int incx, double *result )
 {
-    return cublasDnrm2( handle, n, x, incx, result );
+   return cublasDnrm2( handle, n, x, incx, result );
 }
 
 
@@ -107,7 +107,7 @@ cublasGscal( cublasHandle_t handle, int n,
              const float           *alpha,
              float           *x, int incx )
 {
-    return cublasSscal( handle, n, alpha, x, incx );
+   return cublasSscal( handle, n, alpha, x, incx );
 }
 
 inline cublasStatus_t
@@ -115,7 +115,7 @@ cublasGscal( cublasHandle_t handle, int n,
              const double          *alpha,
              double          *x, int incx )
 {
-    return cublasDscal( handle, n, alpha, x, incx );
+   return cublasDscal( handle, n, alpha, x, incx );
 }
 
 #endif
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6840af9f8c1f1dedeee876fd2bca258a55c64ba
--- /dev/null
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -0,0 +1,189 @@
+/***************************************************************************
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/List.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/Ellpack.h>
+#include <TNL/Matrices/SlicedEllpack.h>
+#include <TNL/Matrices/ChunkedEllpack.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+// silly alias to match the number of template parameters with other formats
+template< typename Real, typename Device, typename Index >
+using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
+
+template< typename Matrix >
+int setHostTestMatrix( Matrix& matrix,
+                       const int elementsPerRow )
+{
+   const int size = matrix.getRows();
+   int elements( 0 );
+   for( int row = 0; row < size; row++ ) {
+      int col = row - elementsPerRow / 2;
+      for( int element = 0; element < elementsPerRow; element++ ) {
+         if( col + element >= 0 &&
+            col + element < size )
+         {
+            matrix.setElement( row, col + element, element + 1 );
+            elements++;
+         }
+      }
+   }
+   return elements;
+}
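+// e.g. for size = 5 and elementsPerRow = 3, setHostTestMatrix produces a
+// banded matrix: row 0 gets columns {0, 1} (column -1 is clipped), row 2
+// gets {1, 2, 3}, and each surviving element is set to element + 1.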
+
+#ifdef HAVE_CUDA
+template< typename Matrix >
+__global__ void setCudaTestMatrixKernel( Matrix* matrix,
+                                         const int elementsPerRow,
+                                         const int gridIdx )
+{
+   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx >= matrix->getRows() )
+      return;
+   int col = rowIdx - elementsPerRow / 2;
+   for( int element = 0; element < elementsPerRow; element++ ) {
+      if( col + element >= 0 &&
+         col + element < matrix->getColumns() )
+         matrix->setElementFast( rowIdx, col + element, element + 1 );
+   }
+}
+#endif
+
+template< typename Matrix >
+void setCudaTestMatrix( Matrix& matrix,
+                        const int elementsPerRow )
+{
+#ifdef HAVE_CUDA
+   typedef typename Matrix::IndexType IndexType;
+   typedef typename Matrix::RealType RealType;
+   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
+      if( gridIdx == cudaGrids - 1 )
+         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+      setCudaTestMatrixKernel< Matrix >
+         <<< cudaGridSize, cudaBlockSize >>>
+         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+      TNL_CHECK_CUDA_DEVICE;
+   }
+#endif
+}
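+// The loop above launches one grid per chunk of getMaxGridSize() blocks, so
+// matrices with more than getMaxGridSize() * 256 rows are processed by
+// several consecutive kernel launches; gridIdx tells the kernel which chunk
+// of rows it is responsible for.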
+
+
+// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
+template< typename Real,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename > class Vector = Containers::Vector >
+bool
+benchmarkSpMV( Benchmark & benchmark,
+               const int & loops,
+               const int & size,
+               const int elementsPerRow = 5 )
+{
+   typedef Matrix< Real, Devices::Host, int > HostMatrix;
+   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
+   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
+
+   HostMatrix hostMatrix;
+   DeviceMatrix deviceMatrix;
+   Containers::Vector< int, Devices::Host, int > hostRowLengths;
+   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+
+   // create benchmark group
+   Containers::List< String > parsedType;
+   parseObjectType( HostMatrix::getType(), parsedType );
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
+
+   hostRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setSize( size );
+   deviceMatrix.setDimensions( size, size );
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+#endif
+
+   hostRowLengths.setValue( elementsPerRow );
+#ifdef HAVE_CUDA
+   deviceRowLengths.setValue( elementsPerRow );
+#endif
+
+   hostMatrix.setCompressedRowLengths( hostRowLengths );
+#ifdef HAVE_CUDA
+   deviceMatrix.setCompressedRowLengths( deviceRowLengths );
+#endif
+
+   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
+   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
+   const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+
+   // reset function
+   auto reset = [&]() {
+      hostVector.setValue( 1.0 );
+      hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+      deviceVector2.setValue( 0.0 );
+#endif
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   };
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
+
+   benchmark.setOperation( datasetSize );
+   benchmark.time( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset, "GPU", spmvCuda );
+#endif
+
+   return true;
+}
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkSpmvSynthetic( Benchmark & benchmark,
+                        const int & loops,
+                        const int & size,
+                        const int & elementsPerRow )
+{
+   bool result = true;
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow );
+   result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow );
+   return result;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/tests/benchmarks/tnl-benchmark-blas.cpp b/src/Benchmarks/BLAS/tnl-benchmark-blas.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-blas.cpp
rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cpp
diff --git a/tests/benchmarks/tnl-benchmark-blas.cu b/src/Benchmarks/BLAS/tnl-benchmark-blas.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-blas.cu
rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cu
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..73ea0b375a23440105ea6dfbae599c63fb108adb
--- /dev/null
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -0,0 +1,192 @@
+/***************************************************************************
+                          tnl-benchmark-blas.h  -  description
+                             -------------------
+    begin                : Jan 27, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Devices/SystemInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/Config/ParameterContainer.h>
+
+#include "array-operations.h"
+#include "vector-operations.h"
+#include "spmv.h"
+
+using namespace TNL;
+using namespace TNL::Benchmarks;
+
+
+// TODO: should benchmarks check the result of the computation?
+
+
+template< typename Real >
+void
+runBlasBenchmarks( Benchmark & benchmark,
+                   Benchmark::MetadataMap metadata,
+                   const std::size_t & minSize,
+                   const std::size_t & maxSize,
+                   const double & sizeStepFactor,
+                   const unsigned & loops,
+                   const unsigned & elementsPerRow )
+{
+   const String precision = getType< Real >();
+   metadata["precision"] = precision;
+
+   // Array operations
+   benchmark.newBenchmark( String("Array operations (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", size},
+      } ));
+      benchmarkArrayOperations< Real >( benchmark, loops, size );
+   }
+
+   // Vector operations
+   benchmark.newBenchmark( String("Vector operations (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"size", size},
+      } ));
+      benchmarkVectorOperations< Real >( benchmark, loops, size );
+   }
+
+   // Sparse matrix-vector multiplication
+   benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         {"rows", size},
+         {"columns", size},
+         {"elements per row", elementsPerRow},
+      } ));
+      benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow );
+   }
+}
+
+void
+setupConfig( Config::ConfigDescription & config )
+{
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
+   config.addEntryEnum( "float" );
+   config.addEntryEnum( "double" );
+   config.addEntryEnum( "all" );
+   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 );
+   config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
+   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+}
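+
+// Typical invocation (illustrative; every option has the default shown above):
+//    tnl-benchmark-blas --precision all --min-size 100000 --max-size 10000000
+//                       --loops 10 --log-file tnl-benchmark-blas.log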
+
+int
+main( int argc, char* argv[] )
+{
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
+
+   setupConfig( conf_desc );
+
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return 1;
+   }
+
+   Devices::Host::setup( parameters );
+   Devices::Cuda::setup( parameters );
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const String & precision = parameters.getParameter< String >( "precision" );
+   // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+   // which have a default value. The workaround below works for int values, but it is not possible
+   // to pass 64-bit integer values
+//   const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+//   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+   const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+   const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
+   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
+   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
+   const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" );
+   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
+
+   if( sizeStepFactor <= 1 ) {
+       std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl;
+       return EXIT_FAILURE;
+   }
+
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+       mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
+
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
+
+   // prepare global metadata
+   const int cpu_id = 0;
+   Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
+   String cacheInfo = String( cacheSizes.L1data ) + ", "
+                       + String( cacheSizes.L1instruction ) + ", "
+                       + String( cacheSizes.L2 ) + ", "
+                       + String( cacheSizes.L3 );
+#ifdef HAVE_CUDA
+   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
+   const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
+                             String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+#endif
+   Benchmark::MetadataMap metadata {
+      { "host name", Devices::SystemInfo::getHostname() },
+      { "architecture", Devices::SystemInfo::getArchitecture() },
+      { "system", Devices::SystemInfo::getSystemName() },
+      { "system release", Devices::SystemInfo::getSystemRelease() },
+      { "start time", Devices::SystemInfo::getCurrentTime() },
+      { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
+      { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+      { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
+      { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
+      { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
+#ifdef HAVE_CUDA
+      { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
+      { "GPU architecture", deviceArch },
+      { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
+      { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
+      { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
+      { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
+      { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
+#endif
+   };
+
+   if( precision == "all" || precision == "float" )
+      runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+   if( precision == "all" || precision == "double" )
+      runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+
+   if( ! benchmark.save( logFile ) ) {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..e65f8980b1066e042206e328d15b50e32c81432f
--- /dev/null
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -0,0 +1,440 @@
+/***************************************************************************
+                          vector-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <stdlib.h> // srand48
+
+#include "../Benchmarks.h"
+
+#include <TNL/Containers/Vector.h>
+
+#ifdef HAVE_CUDA
+#include "cublasWrappers.h"
+#endif
+
+namespace TNL {
+namespace Benchmarks {
+
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkVectorOperations( Benchmark & benchmark,
+                           const int & loops,
+                           const long & size )
+{
+   typedef Containers::Vector< Real, Devices::Host, Index > HostVector;
+   typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector;
+   using namespace std;
+
+   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
+
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+   hostVector.setSize( size );
+   hostVector2.setSize( size );
+#ifdef HAVE_CUDA
+   deviceVector.setSize( size );
+   deviceVector2.setSize( size );
+#endif
+
+   Real resultHost, resultDevice;
+
+#ifdef HAVE_CUDA
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
+#endif
+
+
+   // reset functions
+   // (Make sure to always use one in benchmarks, even when it is not necessary
+   // for a correct result - it helps to clear the cache and prevents the
+   // compiler from optimizing the benchmark loop away.)
+   auto reset1 = [&]() {
+      hostVector.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceVector.setValue( 1.0 );
+#endif
+      // A relatively harmless call to keep the compiler from realizing we
+      // don't actually do any useful work with the result of the reduction.
+      srand48(resultHost);
+      resultHost = resultDevice = 0.0;
+   };
+   auto reset2 = [&]() {
+      hostVector2.setValue( 1.0 );
+#ifdef HAVE_CUDA
+      deviceVector2.setValue( 1.0 );
+#endif
+   };
+   auto reset12 = [&]() {
+      reset1();
+      reset2();
+   };
+
+
+   reset12();
+
+
+   auto maxHost = [&]() {
+      resultHost = hostVector.max();
+   };
+   auto maxHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionMax< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto maxCuda = [&]() {
+      resultDevice = deviceVector.max();
+   };
+   benchmark.setOperation( "max", datasetSize );
+   benchmark.time( reset1, "CPU", maxHost );
+   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", maxCuda );
+#endif
+
+
+   auto minHost = [&]() {
+      resultHost = hostVector.min();
+   };
+   auto minHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionMin< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto minCuda = [&]() {
+      resultDevice = deviceVector.min();
+   };
+   benchmark.setOperation( "min", datasetSize );
+   benchmark.time( reset1, "CPU", minHost );
+   benchmark.time( reset1, "CPU (general)", minHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", minCuda );
+#endif
+
+
+   auto absMaxHost = [&]() {
+      resultHost = hostVector.absMax();
+   };
+   auto absMaxHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto absMaxCuda = [&]() {
+      resultDevice = deviceVector.absMax();
+   };
+#ifdef HAVE_CUDA
+   auto absMaxCublas = [&]() {
+      int index = 0;
+      cublasIgamax( cublasHandle, size,
+                    deviceVector.getData(), 1,
+                    &index );
+      resultDevice = deviceVector.getElement( index );
+   };
+#endif
+   benchmark.setOperation( "absMax", datasetSize );
+   benchmark.time( reset1, "CPU", absMaxHost );
+   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", absMaxCuda );
+   benchmark.time( reset1, "cuBLAS", absMaxCublas );
+#endif
+
+
+   auto absMinHost = [&]() {
+      resultHost = hostVector.absMin();
+   };
+   auto absMinHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto absMinCuda = [&]() {
+      resultDevice = deviceVector.absMin();
+   };
+#ifdef HAVE_CUDA
+   auto absMinCublas = [&]() {
+      int index = 0;
+      cublasIgamin( cublasHandle, size,
+                    deviceVector.getData(), 1,
+                    &index );
+      resultDevice = deviceVector.getElement( index );
+   };
+#endif
+   benchmark.setOperation( "absMin", datasetSize );
+   benchmark.time( reset1, "CPU", absMinHost );
+   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", absMinCuda );
+   benchmark.time( reset1, "cuBLAS", absMinCublas );
+#endif
+
+
+   auto sumHost = [&]() {
+      resultHost = hostVector.sum();
+   };
+   auto sumHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionSum< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto sumCuda = [&]() {
+      resultDevice = deviceVector.sum();
+   };
+   benchmark.setOperation( "sum", datasetSize );
+   benchmark.time( reset1, "CPU", sumHost );
+   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", sumCuda );
+#endif
+
+
+   auto l1normHost = [&]() {
+      resultHost = hostVector.lpNorm( 1.0 );
+   };
+   auto l1normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l1normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 1.0 );
+   };
+#ifdef HAVE_CUDA
+   auto l1normCublas = [&]() {
+      cublasGasum( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "l1 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l1normHost );
+   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l1normCuda );
+   benchmark.time( reset1, "cuBLAS", l1normCublas );
+#endif
+
+
+   auto l2normHost = [&]() {
+      resultHost = hostVector.lpNorm( 2.0 );
+   };
+   auto l2normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l2normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 2.0 );
+   };
+#ifdef HAVE_CUDA
+   auto l2normCublas = [&]() {
+      cublasGnrm2( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "l2 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l2normHost );
+   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l2normCuda );
+   benchmark.time( reset1, "cuBLAS", l2normCublas );
+#endif
+
+
+   auto l3normHost = [&]() {
+      resultHost = hostVector.lpNorm( 3.0 );
+   };
+   auto l3normHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
+      operation.setPower( 3.0 );
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              ( Real* ) 0,
+              result );
+      return result;
+   };
+   auto l3normCuda = [&]() {
+      resultDevice = deviceVector.lpNorm( 3.0 );
+   };
+   benchmark.setOperation( "l3 norm", datasetSize );
+   benchmark.time( reset1, "CPU", l3normHost );
+   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", l3normCuda );
+#endif
+
+
+   auto scalarProductHost = [&]() {
+      resultHost = hostVector.scalarProduct( hostVector2 );
+   };
+   auto scalarProductHostGeneral = [&]() {
+      Real result( 0 );
+      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
+      Containers::Algorithms::Reduction< Devices::Host >::reduce(
+              operation,
+              hostVector.getSize(),
+              hostVector.getData(),
+              hostVector2.getData(),
+              result );
+      return result;
+   };
+   auto scalarProductCuda = [&]() {
+      resultDevice = deviceVector.scalarProduct( deviceVector2 );
+   };
+#ifdef HAVE_CUDA
+   auto scalarProductCublas = [&]() {
+      cublasGdot( cublasHandle, size,
+                  deviceVector.getData(), 1,
+                  deviceVector2.getData(), 1,
+                  &resultDevice );
+   };
+#endif
+   benchmark.setOperation( "scalar product", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", scalarProductHost );
+   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", scalarProductCuda );
+   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+#endif
+
+   /*
+   std::cout << "Benchmarking prefix-sum:" << std::endl;
+   timer.reset();
+   timer.start();
+   hostVector.computePrefixSum();
+   timer.stop();
+   timeHost = timer.getTime();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
+
+   timer.reset();
+   timer.start();
+   deviceVector.computePrefixSum();
+   timer.stop();
+   timeDevice = timer.getTime();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
+   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
+
+   HostVector auxHostVector;
+   auxHostVector.setLike( deviceVector );
+   auxHostVector = deviceVector;
+   for( int i = 0; i < size; i++ )
+      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
+      {
+         std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
+      }
+   */
+
+
+   auto multiplyHost = [&]() {
+      hostVector *= 0.5;
+   };
+   auto multiplyCuda = [&]() {
+      deviceVector *= 0.5;
+   };
+#ifdef HAVE_CUDA
+   auto multiplyCublas = [&]() {
+      const Real alpha = 0.5;
+      cublasGscal( cublasHandle, size,
+                   &alpha,
+                   deviceVector.getData(), 1 );
+   };
+#endif
+   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
+   benchmark.time( reset1, "CPU", multiplyHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", multiplyCuda );
+   benchmark.time( reset1, "cuBLAS", multiplyCublas );
+#endif
+
+
+   auto addVectorHost = [&]() {
+      hostVector.addVector( hostVector2 );
+   };
+   auto addVectorCuda = [&]() {
+      deviceVector.addVector( deviceVector2 );
+   };
+#ifdef HAVE_CUDA
+   auto addVectorCublas = [&]() {
+      const Real alpha = 1.0;
+      cublasGaxpy( cublasHandle, size,
+                   &alpha,
+                   deviceVector2.getData(), 1,
+                   deviceVector.getData(), 1 );
+   };
+#endif
+   benchmark.setOperation( "vector addition", 3 * datasetSize );
+   benchmark.time( reset1, "CPU", addVectorHost );
+#ifdef HAVE_CUDA
+   benchmark.time( reset1, "GPU", addVectorCuda );
+   benchmark.time( reset1, "cuBLAS", addVectorCublas );
+#endif
+
+
+#ifdef HAVE_CUDA
+   cublasDestroy( cublasHandle );
+#endif
+
+   return true;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..60decddc8c04c9422451b1581f8db044655a93ba
--- /dev/null
+++ b/src/Benchmarks/Benchmarks.h
@@ -0,0 +1,456 @@
+/***************************************************************************
+                          Benchmarks.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <vector>
+
+#include <TNL/Timer.h>
+#include <TNL/String.h>
+#include <TNL/Solvers/IterativeSolverMonitor.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+const double oneGB = 1024.0 * 1024.0 * 1024.0;
+
+template< typename ComputeFunction,
+          typename ResetFunction,
+          typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > >
+double
+timeFunction( ComputeFunction compute,
+              ResetFunction reset,
+              int loops,
+              Monitor && monitor = Monitor() )
+{
+   // the timer is constructed zero-initialized and stopped
+   Timer timer;
+
+   // set timer to the monitor
+   monitor.setTimer( timer );
+
+   // warm up
+   reset();
+   compute();
+
+   for( int i = 0; i < loops; ++i ) {
+      // abuse the monitor's "time" for loops
+      monitor.setTime( i + 1 );
+
+      reset();
+
+      // Explicit synchronization of the CUDA device
+      // TODO: not necessary for host computations
+#ifdef HAVE_CUDA
+      cudaDeviceSynchronize();
+#endif
+      timer.start();
+      compute();
+#ifdef HAVE_CUDA
+      cudaDeviceSynchronize();
+#endif
+      timer.stop();
+   }
+
+   return timer.getRealTime();
+}
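+
+// A minimal usage sketch (hostVector, result and loops are illustrative names,
+// not part of this header):
+//
+//    auto reset   = [&]() { hostVector.setValue( 1.0 ); };
+//    auto compute = [&]() { result = hostVector.sum(); };
+//    const double time = timeFunction( compute, reset, loops );
+//
+// The benchmark headers in this directory do not call timeFunction directly;
+// they go through Benchmark::time(), which also records the results in the log.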
+
+
+class Logging
+{
+public:
+   using MetadataElement = std::pair< const char*, String >;
+   using MetadataMap = std::map< const char*, String >;
+   using MetadataColumns = std::vector<MetadataElement>;
+
+   using HeaderElements = std::initializer_list< String >;
+   using RowElements = std::initializer_list< double >;
+
+   Logging( bool verbose = true )
+   : verbose(verbose)
+   {}
+
+   void
+   writeTitle( const String & title )
+   {
+      if( verbose )
+         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+      log << ": title = " << title << std::endl;
+   }
+
+   void
+   writeMetadata( const MetadataMap & metadata )
+   {
+      if( verbose )
+         std::cout << "properties:" << std::endl;
+
+      for( auto & it : metadata ) {
+         if( verbose )
+            std::cout << "   " << it.first << " = " << it.second << std::endl;
+         log << ": " << it.first << " = " << it.second << std::endl;
+      }
+      if( verbose )
+         std::cout << std::endl;
+   }
+
+   void
+   writeTableHeader( const String & spanningElement,
+                     const HeaderElements & subElements )
+   {
+      using namespace std;
+
+      if( verbose && header_changed ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.first;
+         }
+
+         // spanning element is printed as usual column to stdout,
+         // but is excluded from header
+         std::cout << std::setw( 15 ) << "";
+
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+
+         header_changed = false;
+      }
+
+      // initial indent string
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // dump stacked spanning columns
+      if( horizontalGroups.size() > 0 )
+         while( horizontalGroups.back().second <= 0 ) {
+            horizontalGroups.pop_back();
+            header_indent.pop_back();
+         }
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+
+      log << header_indent << " " << spanningElement << std::endl;
+      for( auto & it : subElements ) {
+         log << header_indent << "! " << it << std::endl;
+      }
+
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second--;
+         header_indent.pop_back();
+      }
+   }
+
+   void
+   writeTableRow( const String & spanningElement,
+                  const RowElements & subElements )
+   {
+      using namespace std;
+
+      if( verbose ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.second;
+         }
+         // the spanning element is printed as a usual column to stdout
+         std::cout << std::setw( 15 ) << spanningElement;
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 );
+            if( it != 0.0 ) std::cout << it;
+            else std::cout << "N/A";
+         }
+         std::cout << std::endl;
+      }
+
+      // the header has already been adjusted in writeTableHeader,
+      // so print each metadata element on a separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+
+      // benchmark data are indented
+      const String indent = "    ";
+      for( auto & it : subElements ) {
+         if( it != 0.0 ) log << indent << it << std::endl;
+         else log << indent << "N/A" << std::endl;
+      }
+   }
+
+   void
+   writeErrorMessage( const char* msg,
+                      int colspan = 1 )
+   {
+      // initial indent string
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // make sure there is a header column for the message
+      if( horizontalGroups.size() == 0 )
+         horizontalGroups.push_back( {"", 1} );
+
+      // dump stacked spanning columns
+      while( ! horizontalGroups.empty() && horizontalGroups.back().second <= 0 ) {
+         horizontalGroups.pop_back();
+         header_indent.pop_back();
+      }
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second -= colspan;
+         header_indent.pop_back();
+      }
+
+      // the header has already been adjusted above,
+      // so print each metadata element on a separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+      log << msg << std::endl;
+   }
+
+   void
+   closeTable()
+   {
+      log << std::endl;
+      header_indent = body_indent = "";
+      header_changed = true;
+      horizontalGroups.clear();
+   }
+
+   bool save( std::ostream & logFile )
+   {
+      closeTable();
+      logFile << log.str();
+      if( logFile.good() ) {
+         log.str() = "";
+         return true;
+      }
+      return false;
+   }
+
+protected:
+
+   // manual double -> String conversion with fixed precision
+   static String
+   _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )
+         str << std::setprecision( precision );
+      str << num;
+      return String( str.str().data() );
+   }
+
+   std::stringstream log;
+   std::string header_indent;
+   std::string body_indent;
+
+   bool verbose;
+   MetadataColumns metadataColumns;
+   bool header_changed = true;
+   std::vector< std::pair< String, int > > horizontalGroups;
+};
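+
+// The log produced by Logging is line-oriented (a sketch inferred from the
+// methods above): metadata lines start with ": ", header lines carry one "!"
+// per nesting level, and row values follow, indented by four spaces:
+//
+//    : title = Array operations
+//    ! operation
+//    ! performer
+//    !! bandwidth
+//    !! time
+//    !! speedup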
+
+
+class Benchmark
+: protected Logging
+{
+public:
+   using Logging::MetadataElement;
+   using Logging::MetadataMap;
+   using Logging::MetadataColumns;
+
+   Benchmark( int loops = 10,
+              bool verbose = true )
+   : Logging(verbose), loops(loops)
+   {}
+
+   // TODO: ensure that this is not called in the middle of the benchmark
+   // (or just remove it completely?)
+   void
+   setLoops( int loops )
+   {
+      this->loops = loops;
+   }
+
+   // Marks the start of a new benchmark
+   void
+   newBenchmark( const String & title )
+   {
+      closeTable();
+      writeTitle( title );
+      monitor.setStage( title.getString() );
+   }
+
+   // Marks the start of a new benchmark (with custom metadata)
+   void
+   newBenchmark( const String & title,
+                 MetadataMap metadata )
+   {
+      closeTable();
+      writeTitle( title );
+      monitor.setStage( title.getString() );
+      // add loops to metadata
+      metadata["loops"] = String(loops);
+      writeMetadata( metadata );
+   }
+
+   // Sets metadata columns -- values used for all subsequent rows until
+   // the next call to this function.
+   void
+   setMetadataColumns( const MetadataColumns & metadata )
+   {
+      if( metadataColumns != metadata )
+         header_changed = true;
+      metadataColumns = metadata;
+   }
+
+   // TODO: maybe rename to createVerticalGroup and ensure that vertical and
+   // horizontal groups are not mixed within the same "Benchmark"
+   // Sets the current operation -- operations expand the table vertically.
+   //  - baseTime should be reset to 0.0 for most operations, but sometimes
+   //    it is useful to override it
+   //  - The order of operations inside a "Benchmark" does not matter; rows can
+   //    be easily sorted while converting to HTML.
+   void
+   setOperation( const String & operation,
+                 const double datasetSize = 0.0, // in GB
+                 const double baseTime = 0.0 )
+   {
+      if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
+         metadataColumns[ 0 ].second = operation;
+      }
+      else {
+         metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
+      }
+      setOperation( datasetSize, baseTime );
+      header_changed = true;
+   }
+
+   void
+   setOperation( const double datasetSize = 0.0,
+                 const double baseTime = 0.0 )
+   {
+      this->datasetSize = datasetSize;
+      this->baseTime = baseTime;
+   }
+
+   // Creates new horizontal groups inside a benchmark -- increases the number
+   // of columns in the "Benchmark", implies column spanning.
+   // (Useful e.g. for SpMV formats, different configurations etc.)
+   void
+   createHorizontalGroup( const String & name,
+                          int subcolumns )
+   {
+      if( horizontalGroups.size() == 0 ) {
+         horizontalGroups.push_back( {name, subcolumns} );
+      }
+      else {
+         auto & last = horizontalGroups.back();
+         if( last.first != name && last.second > 0 ) {
+            horizontalGroups.push_back( {name, subcolumns} );
+         }
+         else {
+            last.first = name;
+            last.second = subcolumns;
+         }
+      }
+   }
+
+   // Times a single ComputeFunction. Subsequent calls implicitly split
+   // the current "horizontal group" into sub-columns identified by
+   // "performer", which are further split into "bandwidth", "time" and
+   // "speedup" columns.
+   // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
+   // Also terminates the recursion of the following variadic template.
+   template< typename ResetFunction,
+             typename ComputeFunction >
+   double
+   time( ResetFunction reset,
+         const String & performer,
+         ComputeFunction & compute )
+   {
+      double time;
+      if( verbose ) {
+         // run the monitor main loop
+         Solvers::SolverMonitorThread monitor_thread( monitor );
+         time = timeFunction( compute, reset, loops, monitor );
+      }
+      else {
+         time = timeFunction( compute, reset, loops, monitor );
+      }
+
+      const double bandwidth = datasetSize / time;
+      const double speedup = this->baseTime / time;
+      if( this->baseTime == 0.0 )
+         this->baseTime = time;
+
+      writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
+      writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );
+
+      return this->baseTime;
+   }
+
+   // Recursive template function to deal with multiple computations with the
+   // same reset function.
+   template< typename ResetFunction,
+             typename ComputeFunction,
+             typename... NextComputations >
+   inline double
+   time( ResetFunction reset,
+         const String & performer,
+         ComputeFunction & compute,
+         NextComputations & ... nextComputations )
+   {
+      time( reset, performer, compute );
+      time( reset, nextComputations... );
+      return this->baseTime;
+   }
+
+   // Adds an error message to the log. Should be called in places where the
+   // "time" method could not be called (e.g. due to failed allocation).
+   void
+   addErrorMessage( const char* msg,
+                    int numberOfComputations = 1 )
+   {
+      // each computation has 3 subcolumns
+      const int colspan = 3 * numberOfComputations;
+      writeErrorMessage( msg, colspan );
+   }
+
+   using Logging::save;
+
+protected:
+   int loops;
+   double datasetSize = 0.0;
+   double baseTime = 0.0;
+   Solvers::IterativeSolverMonitor< double, int > monitor;
+};
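+
+// A typical benchmarking sequence might look like this (a sketch with
+// hypothetical names; resetAll, computeOnHost and computeOnCuda stand for
+// user-supplied functors):
+//
+//    Benchmark benchmark( 10 );
+//    benchmark.newBenchmark( "Vector operations" );
+//    benchmark.setMetadataColumns( MetadataColumns({ {"size", String(size)} }) );
+//    benchmark.setOperation( "add", 3 * size * sizeof(double) / 1e9 );
+//    benchmark.time( resetAll, "CPU", computeOnHost );
+//    benchmark.time( resetAll, "GPU", computeOnCuda );
+//
+// The first timed computation sets baseTime, so subsequent performers report
+// their speedup relative to it.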
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e34ade5be305bdee495749a38db1b9af004a6a92
--- /dev/null
+++ b/src/Benchmarks/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_subdirectory( HeatEquation )
+add_subdirectory( BLAS )
+add_subdirectory( SpMV )
+add_subdirectory( LinearSolvers )
+
+set( headers
+     Benchmarks.h
+)
+
+install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks )
diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h
rename to src/Benchmarks/HeatEquation/BenchmarkLaplace.h
diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h
rename to src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquation/CMakeLists.txt
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/CMakeLists.txt
rename to src/Benchmarks/HeatEquation/CMakeLists.txt
diff --git a/tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h b/src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h
rename to src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h
rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h
diff --git a/tests/benchmarks/heat-equation-benchmark/TestGridEntity.h b/src/Benchmarks/HeatEquation/TestGridEntity.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/TestGridEntity.h
rename to src/Benchmarks/HeatEquation/TestGridEntity.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h b/src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h
rename to src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h
rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h
rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h b/src/Benchmarks/HeatEquation/Tuning/SimpleCell.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h
rename to src/Benchmarks/HeatEquation/Tuning/SimpleCell.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h
rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h
rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h
diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h b/src/Benchmarks/HeatEquation/Tuning/tunning.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h
rename to src/Benchmarks/HeatEquation/Tuning/tunning.h
diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/src/Benchmarks/HeatEquation/pure-c-rhs.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
rename to src/Benchmarks/HeatEquation/pure-c-rhs.h
diff --git a/tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark b/src/Benchmarks/HeatEquation/run-HeatEquationBenchmark
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark
rename to src/Benchmarks/HeatEquation/run-HeatEquationBenchmark
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h b/src/Benchmarks/HeatEquation/tnlTestGrid2D.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h
rename to src/Benchmarks/HeatEquation/tnlTestGrid2D.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h b/src/Benchmarks/HeatEquation/tnlTestGridEntity.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h
rename to src/Benchmarks/HeatEquation/tnlTestGridEntity.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h
similarity index 100%
rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h
rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h
diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a95c92f791c7d7a0176415fd4968d8aa6bb8982
--- /dev/null
+++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
+endif()
+
+install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.cpp
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.cu
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-linear-solvers.h
rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a73e6738cdb8bb4effd889960f3ffcd5b7255b90
--- /dev/null
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl )
+endif()
+
+install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.cpp
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp
diff --git a/tests/benchmarks/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.cu
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cu
diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
similarity index 100%
rename from tests/benchmarks/tnl-benchmark-spmv.h
rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.h
diff --git a/tests/benchmarks/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
similarity index 100%
rename from tests/benchmarks/tnlCusparseCSRMatrix.h
rename to src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
diff --git a/tests/benchmarks/share/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt
similarity index 100%
rename from tests/benchmarks/share/CMakeLists.txt
rename to src/Benchmarks/scripts/CMakeLists.txt
diff --git a/tests/benchmarks/share/convert-matrices b/src/Benchmarks/scripts/convert-matrices
similarity index 100%
rename from tests/benchmarks/share/convert-matrices
rename to src/Benchmarks/scripts/convert-matrices
diff --git a/tests/benchmarks/share/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf
similarity index 100%
rename from tests/benchmarks/share/cuda-profiler.conf
rename to src/Benchmarks/scripts/cuda-profiler.conf
diff --git a/tests/benchmarks/share/draw-matrices b/src/Benchmarks/scripts/draw-matrices
similarity index 100%
rename from tests/benchmarks/share/draw-matrices
rename to src/Benchmarks/scripts/draw-matrices
diff --git a/tests/benchmarks/share/florida-matrix-market b/src/Benchmarks/scripts/florida-matrix-market
similarity index 100%
rename from tests/benchmarks/share/florida-matrix-market
rename to src/Benchmarks/scripts/florida-matrix-market
diff --git a/tests/benchmarks/share/get-matrices b/src/Benchmarks/scripts/get-matrices
similarity index 100%
rename from tests/benchmarks/share/get-matrices
rename to src/Benchmarks/scripts/get-matrices
diff --git a/tests/benchmarks/share/matrix-market b/src/Benchmarks/scripts/matrix-market
similarity index 100%
rename from tests/benchmarks/share/matrix-market
rename to src/Benchmarks/scripts/matrix-market
diff --git a/tests/benchmarks/share/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl
similarity index 100%
rename from tests/benchmarks/share/process-cuda-profile.pl
rename to src/Benchmarks/scripts/process-cuda-profile.pl
diff --git a/tests/benchmarks/share/run-matrix-solvers-benchmark b/src/Benchmarks/scripts/run-matrix-solvers-benchmark
similarity index 100%
rename from tests/benchmarks/share/run-matrix-solvers-benchmark
rename to src/Benchmarks/scripts/run-matrix-solvers-benchmark
diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers
similarity index 100%
rename from tests/benchmarks/share/run-tnl-benchmark-linear-solvers
rename to src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers
diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
similarity index 100%
rename from tests/benchmarks/share/run-tnl-benchmark-spmv
rename to src/Benchmarks/scripts/run-tnl-benchmark-spmv
diff --git a/tests/benchmarks/share/tnl-run-heat-equation-benchmark b/src/Benchmarks/scripts/tnl-run-heat-equation-benchmark
similarity index 100%
rename from tests/benchmarks/share/tnl-run-heat-equation-benchmark
rename to src/Benchmarks/scripts/tnl-run-heat-equation-benchmark
diff --git a/tests/benchmarks/share/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark
similarity index 100%
rename from tests/benchmarks/share/tnl-run-spmv-benchmark
rename to src/Benchmarks/scripts/tnl-run-spmv-benchmark
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a2086691dc4ccbe4c263fe5f11b40e361d957d73..ee275cafcc5f5f2a506fb1d2a79037f877963173 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,16 @@
 ADD_SUBDIRECTORY( TNL )
 
+# Note that it is important to start building the examples as soon as possible,
+# because they take the longest time and the rest of the build can be pipelined
+# alongside them (at least with Ninja).
+if( ${WITH_EXAMPLES} )
+   add_subdirectory( Examples )
+endif()
+
+if( ${WITH_BENCHMARKS} )
+   ADD_SUBDIRECTORY( Benchmarks )
+endif()
+
 if( ${WITH_TOOLS} )
    ADD_SUBDIRECTORY( Tools )
 endif()
diff --git a/examples/CMakeLists.txt b/src/Examples/CMakeLists.txt
similarity index 100%
rename from examples/CMakeLists.txt
rename to src/Examples/CMakeLists.txt
diff --git a/examples/heat-equation/CMakeLists.txt b/src/Examples/heat-equation/CMakeLists.txt
similarity index 100%
rename from examples/heat-equation/CMakeLists.txt
rename to src/Examples/heat-equation/CMakeLists.txt
diff --git a/examples/heat-equation/HeatEquationBuildConfigTag.h b/src/Examples/heat-equation/HeatEquationBuildConfigTag.h
similarity index 100%
rename from examples/heat-equation/HeatEquationBuildConfigTag.h
rename to src/Examples/heat-equation/HeatEquationBuildConfigTag.h
diff --git a/examples/heat-equation/tnl-heat-equation-eoc.cpp b/src/Examples/heat-equation/tnl-heat-equation-eoc.cpp
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation-eoc.cpp
rename to src/Examples/heat-equation/tnl-heat-equation-eoc.cpp
diff --git a/examples/heat-equation/tnl-heat-equation-eoc.cu b/src/Examples/heat-equation/tnl-heat-equation-eoc.cu
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation-eoc.cu
rename to src/Examples/heat-equation/tnl-heat-equation-eoc.cu
diff --git a/examples/heat-equation/tnl-heat-equation-eoc.h b/src/Examples/heat-equation/tnl-heat-equation-eoc.h
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation-eoc.h
rename to src/Examples/heat-equation/tnl-heat-equation-eoc.h
diff --git a/examples/heat-equation/tnl-heat-equation.cpp b/src/Examples/heat-equation/tnl-heat-equation.cpp
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation.cpp
rename to src/Examples/heat-equation/tnl-heat-equation.cpp
diff --git a/examples/heat-equation/tnl-heat-equation.cu b/src/Examples/heat-equation/tnl-heat-equation.cu
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation.cu
rename to src/Examples/heat-equation/tnl-heat-equation.cu
diff --git a/examples/heat-equation/tnl-heat-equation.h b/src/Examples/heat-equation/tnl-heat-equation.h
similarity index 100%
rename from examples/heat-equation/tnl-heat-equation.h
rename to src/Examples/heat-equation/tnl-heat-equation.h
diff --git a/examples/heat-equation/tnl-run-heat-equation b/src/Examples/heat-equation/tnl-run-heat-equation
similarity index 100%
rename from examples/heat-equation/tnl-run-heat-equation
rename to src/Examples/heat-equation/tnl-run-heat-equation
diff --git a/examples/heat-equation/tnl-run-heat-equation-eoc-test b/src/Examples/heat-equation/tnl-run-heat-equation-eoc-test
similarity index 100%
rename from examples/heat-equation/tnl-run-heat-equation-eoc-test
rename to src/Examples/heat-equation/tnl-run-heat-equation-eoc-test
diff --git a/examples/inviscid-flow/CMakeLists.txt b/src/Examples/inviscid-flow/CMakeLists.txt
similarity index 100%
rename from examples/inviscid-flow/CMakeLists.txt
rename to src/Examples/inviscid-flow/CMakeLists.txt
diff --git a/examples/inviscid-flow/CompressibleConservativeVariables.h b/src/Examples/inviscid-flow/CompressibleConservativeVariables.h
similarity index 100%
rename from examples/inviscid-flow/CompressibleConservativeVariables.h
rename to src/Examples/inviscid-flow/CompressibleConservativeVariables.h
diff --git a/examples/inviscid-flow/LaxFridrichs.h b/src/Examples/inviscid-flow/LaxFridrichs.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichs.h
rename to src/Examples/inviscid-flow/LaxFridrichs.h
diff --git a/examples/inviscid-flow/LaxFridrichsContinuity.h b/src/Examples/inviscid-flow/LaxFridrichsContinuity.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsContinuity.h
rename to src/Examples/inviscid-flow/LaxFridrichsContinuity.h
diff --git a/examples/inviscid-flow/LaxFridrichsEnergy.h b/src/Examples/inviscid-flow/LaxFridrichsEnergy.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsEnergy.h
rename to src/Examples/inviscid-flow/LaxFridrichsEnergy.h
diff --git a/examples/inviscid-flow/LaxFridrichsMomentumBase.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumBase.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsMomentumBase.h
rename to src/Examples/inviscid-flow/LaxFridrichsMomentumBase.h
diff --git a/examples/inviscid-flow/LaxFridrichsMomentumX.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumX.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsMomentumX.h
rename to src/Examples/inviscid-flow/LaxFridrichsMomentumX.h
diff --git a/examples/inviscid-flow/LaxFridrichsMomentumY.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumY.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsMomentumY.h
rename to src/Examples/inviscid-flow/LaxFridrichsMomentumY.h
diff --git a/examples/inviscid-flow/LaxFridrichsMomentumZ.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumZ.h
similarity index 100%
rename from examples/inviscid-flow/LaxFridrichsMomentumZ.h
rename to src/Examples/inviscid-flow/LaxFridrichsMomentumZ.h
diff --git a/examples/inviscid-flow/PhysicalVariablesGetter.h b/src/Examples/inviscid-flow/PhysicalVariablesGetter.h
similarity index 100%
rename from examples/inviscid-flow/PhysicalVariablesGetter.h
rename to src/Examples/inviscid-flow/PhysicalVariablesGetter.h
diff --git a/examples/inviscid-flow/RiemannProblemInitialCondition.h b/src/Examples/inviscid-flow/RiemannProblemInitialCondition.h
similarity index 100%
rename from examples/inviscid-flow/RiemannProblemInitialCondition.h
rename to src/Examples/inviscid-flow/RiemannProblemInitialCondition.h
diff --git a/examples/inviscid-flow/euler.cpp b/src/Examples/inviscid-flow/euler.cpp
similarity index 100%
rename from examples/inviscid-flow/euler.cpp
rename to src/Examples/inviscid-flow/euler.cpp
diff --git a/examples/inviscid-flow/euler.cu b/src/Examples/inviscid-flow/euler.cu
similarity index 100%
rename from examples/inviscid-flow/euler.cu
rename to src/Examples/inviscid-flow/euler.cu
diff --git a/examples/inviscid-flow/euler.h b/src/Examples/inviscid-flow/euler.h
similarity index 100%
rename from examples/inviscid-flow/euler.h
rename to src/Examples/inviscid-flow/euler.h
diff --git a/examples/inviscid-flow/eulerBuildConfigTag.h b/src/Examples/inviscid-flow/eulerBuildConfigTag.h
similarity index 100%
rename from examples/inviscid-flow/eulerBuildConfigTag.h
rename to src/Examples/inviscid-flow/eulerBuildConfigTag.h
diff --git a/examples/inviscid-flow/eulerProblem.h b/src/Examples/inviscid-flow/eulerProblem.h
similarity index 100%
rename from examples/inviscid-flow/eulerProblem.h
rename to src/Examples/inviscid-flow/eulerProblem.h
diff --git a/examples/inviscid-flow/eulerProblem_impl.h b/src/Examples/inviscid-flow/eulerProblem_impl.h
similarity index 100%
rename from examples/inviscid-flow/eulerProblem_impl.h
rename to src/Examples/inviscid-flow/eulerProblem_impl.h
diff --git a/examples/inviscid-flow/eulerRhs.h b/src/Examples/inviscid-flow/eulerRhs.h
similarity index 100%
rename from examples/inviscid-flow/eulerRhs.h
rename to src/Examples/inviscid-flow/eulerRhs.h
diff --git a/examples/inviscid-flow/run-euler b/src/Examples/inviscid-flow/run-euler
similarity index 100%
rename from examples/inviscid-flow/run-euler
rename to src/Examples/inviscid-flow/run-euler
diff --git a/examples/mean-curvature-flow/CMakeLists.txt b/src/Examples/mean-curvature-flow/CMakeLists.txt
similarity index 100%
rename from examples/mean-curvature-flow/CMakeLists.txt
rename to src/Examples/mean-curvature-flow/CMakeLists.txt
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.cu b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cu
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.cu
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cu
diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.h b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.h
similarity index 100%
rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.h
rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.h
diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow
similarity index 100%
rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow
rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow
diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video
similarity index 100%
rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video
rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video
diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test
similarity index 100%
rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test
rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test
diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos
similarity index 100%
rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos
rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos
diff --git a/examples/narrow-band/CMakeLists.txt b/src/Examples/narrow-band/CMakeLists.txt
similarity index 100%
rename from examples/narrow-band/CMakeLists.txt
rename to src/Examples/narrow-band/CMakeLists.txt
diff --git a/examples/narrow-band/MainBuildConfig.h b/src/Examples/narrow-band/MainBuildConfig.h
similarity index 100%
rename from examples/narrow-band/MainBuildConfig.h
rename to src/Examples/narrow-band/MainBuildConfig.h
diff --git a/examples/narrow-band/main.cpp b/src/Examples/narrow-band/main.cpp
similarity index 100%
rename from examples/narrow-band/main.cpp
rename to src/Examples/narrow-band/main.cpp
diff --git a/examples/narrow-band/main.cu b/src/Examples/narrow-band/main.cu
similarity index 100%
rename from examples/narrow-band/main.cu
rename to src/Examples/narrow-band/main.cu
diff --git a/examples/narrow-band/main.h b/src/Examples/narrow-band/main.h
similarity index 100%
rename from examples/narrow-band/main.h
rename to src/Examples/narrow-band/main.h
diff --git a/examples/narrow-band/narrowBandConfig.h b/src/Examples/narrow-band/narrowBandConfig.h
similarity index 100%
rename from examples/narrow-band/narrowBandConfig.h
rename to src/Examples/narrow-band/narrowBandConfig.h
diff --git a/examples/narrow-band/tnlNarrowBand.h b/src/Examples/narrow-band/tnlNarrowBand.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand.h
rename to src/Examples/narrow-band/tnlNarrowBand.h
diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
rename to src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
rename to src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
diff --git a/examples/narrow-band/tnlNarrowBand2D_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_impl.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand2D_impl.h
rename to src/Examples/narrow-band/tnlNarrowBand2D_impl.h
diff --git a/examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h b/src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h
rename to src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h
diff --git a/examples/narrow-band/tnlNarrowBand3D_impl.h b/src/Examples/narrow-band/tnlNarrowBand3D_impl.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand3D_impl.h
rename to src/Examples/narrow-band/tnlNarrowBand3D_impl.h
diff --git a/examples/narrow-band/tnlNarrowBand_CUDA.h b/src/Examples/narrow-band/tnlNarrowBand_CUDA.h
similarity index 100%
rename from examples/narrow-band/tnlNarrowBand_CUDA.h
rename to src/Examples/narrow-band/tnlNarrowBand_CUDA.h
diff --git a/examples/navier-stokes/CMakeLists.txt b/src/Examples/navier-stokes/CMakeLists.txt
similarity index 100%
rename from examples/navier-stokes/CMakeLists.txt
rename to src/Examples/navier-stokes/CMakeLists.txt
diff --git a/examples/navier-stokes/Makefile b/src/Examples/navier-stokes/Makefile
similarity index 100%
rename from examples/navier-stokes/Makefile
rename to src/Examples/navier-stokes/Makefile
diff --git a/examples/navier-stokes/main.cpp b/src/Examples/navier-stokes/main.cpp
similarity index 100%
rename from examples/navier-stokes/main.cpp
rename to src/Examples/navier-stokes/main.cpp
diff --git a/examples/navier-stokes/make-png-from-gnuplot b/src/Examples/navier-stokes/make-png-from-gnuplot
similarity index 100%
rename from examples/navier-stokes/make-png-from-gnuplot
rename to src/Examples/navier-stokes/make-png-from-gnuplot
diff --git a/examples/navier-stokes/make-png-vectors-from-gnuplot b/src/Examples/navier-stokes/make-png-vectors-from-gnuplot
similarity index 100%
rename from examples/navier-stokes/make-png-vectors-from-gnuplot
rename to src/Examples/navier-stokes/make-png-vectors-from-gnuplot
diff --git a/examples/navier-stokes/merge-figures b/src/Examples/navier-stokes/merge-figures
similarity index 100%
rename from examples/navier-stokes/merge-figures
rename to src/Examples/navier-stokes/merge-figures
diff --git a/examples/navier-stokes/navier-stokes.cfg.desc b/src/Examples/navier-stokes/navier-stokes.cfg.desc
similarity index 100%
rename from examples/navier-stokes/navier-stokes.cfg.desc
rename to src/Examples/navier-stokes/navier-stokes.cfg.desc
diff --git a/examples/navier-stokes/navierStokesBoundaryConditions.h b/src/Examples/navier-stokes/navierStokesBoundaryConditions.h
similarity index 100%
rename from examples/navier-stokes/navierStokesBoundaryConditions.h
rename to src/Examples/navier-stokes/navierStokesBoundaryConditions.h
diff --git a/examples/navier-stokes/navierStokesBoundaryConditions_impl.h b/src/Examples/navier-stokes/navierStokesBoundaryConditions_impl.h
similarity index 100%
rename from examples/navier-stokes/navierStokesBoundaryConditions_impl.h
rename to src/Examples/navier-stokes/navierStokesBoundaryConditions_impl.h
diff --git a/examples/navier-stokes/navierStokesSetter.h b/src/Examples/navier-stokes/navierStokesSetter.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSetter.h
rename to src/Examples/navier-stokes/navierStokesSetter.h
diff --git a/examples/navier-stokes/navierStokesSetter_impl.h b/src/Examples/navier-stokes/navierStokesSetter_impl.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSetter_impl.h
rename to src/Examples/navier-stokes/navierStokesSetter_impl.h
diff --git a/examples/navier-stokes/navierStokesSolver.h b/src/Examples/navier-stokes/navierStokesSolver.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSolver.h
rename to src/Examples/navier-stokes/navierStokesSolver.h
diff --git a/examples/navier-stokes/navierStokesSolverMonitor.h b/src/Examples/navier-stokes/navierStokesSolverMonitor.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSolverMonitor.h
rename to src/Examples/navier-stokes/navierStokesSolverMonitor.h
diff --git a/examples/navier-stokes/navierStokesSolverMonitor_impl.h b/src/Examples/navier-stokes/navierStokesSolverMonitor_impl.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSolverMonitor_impl.h
rename to src/Examples/navier-stokes/navierStokesSolverMonitor_impl.h
diff --git a/examples/navier-stokes/navierStokesSolver_impl.h b/src/Examples/navier-stokes/navierStokesSolver_impl.h
similarity index 100%
rename from examples/navier-stokes/navierStokesSolver_impl.h
rename to src/Examples/navier-stokes/navierStokesSolver_impl.h
diff --git a/examples/navier-stokes/share/CMakeLists.txt b/src/Examples/navier-stokes/share/CMakeLists.txt
similarity index 100%
rename from examples/navier-stokes/share/CMakeLists.txt
rename to src/Examples/navier-stokes/share/CMakeLists.txt
diff --git a/examples/navier-stokes/share/examples/CMakeLists.txt b/src/Examples/navier-stokes/share/examples/CMakeLists.txt
similarity index 100%
rename from examples/navier-stokes/share/examples/CMakeLists.txt
rename to src/Examples/navier-stokes/share/examples/CMakeLists.txt
diff --git a/examples/navier-stokes/share/examples/cavity b/src/Examples/navier-stokes/share/examples/cavity
similarity index 100%
rename from examples/navier-stokes/share/examples/cavity
rename to src/Examples/navier-stokes/share/examples/cavity
diff --git a/examples/quad-test/.gitignore b/src/Examples/quad-test/.gitignore
similarity index 100%
rename from examples/quad-test/.gitignore
rename to src/Examples/quad-test/.gitignore
diff --git a/examples/quad-test/Makefile b/src/Examples/quad-test/Makefile
similarity index 100%
rename from examples/quad-test/Makefile
rename to src/Examples/quad-test/Makefile
diff --git a/examples/quad-test/main.cpp b/src/Examples/quad-test/main.cpp
similarity index 100%
rename from examples/quad-test/main.cpp
rename to src/Examples/quad-test/main.cpp
diff --git a/examples/quad-test/quad-test.cfg.desc b/src/Examples/quad-test/quad-test.cfg.desc
similarity index 100%
rename from examples/quad-test/quad-test.cfg.desc
rename to src/Examples/quad-test/quad-test.cfg.desc
diff --git a/examples/simple-examples/CMakeLists.txt b/src/Examples/simple-examples/CMakeLists.txt
similarity index 100%
rename from examples/simple-examples/CMakeLists.txt
rename to src/Examples/simple-examples/CMakeLists.txt
diff --git a/examples/simple-examples/large-meshfunction-example.cpp b/src/Examples/simple-examples/large-meshfunction-example.cpp
similarity index 100%
rename from examples/simple-examples/large-meshfunction-example.cpp
rename to src/Examples/simple-examples/large-meshfunction-example.cpp
diff --git a/examples/simple-examples/large-meshfunction-example.cu b/src/Examples/simple-examples/large-meshfunction-example.cu
similarity index 100%
rename from examples/simple-examples/large-meshfunction-example.cu
rename to src/Examples/simple-examples/large-meshfunction-example.cu
diff --git a/examples/simple-examples/large-meshfunction-example.h b/src/Examples/simple-examples/large-meshfunction-example.h
similarity index 100%
rename from examples/simple-examples/large-meshfunction-example.h
rename to src/Examples/simple-examples/large-meshfunction-example.h
diff --git a/examples/transport-equation/CMakeLists.txt b/src/Examples/transport-equation/CMakeLists.txt
similarity index 100%
rename from examples/transport-equation/CMakeLists.txt
rename to src/Examples/transport-equation/CMakeLists.txt
diff --git a/examples/transport-equation/tnl-run-transport-equation b/src/Examples/transport-equation/tnl-run-transport-equation
similarity index 100%
rename from examples/transport-equation/tnl-run-transport-equation
rename to src/Examples/transport-equation/tnl-run-transport-equation
diff --git a/examples/transport-equation/tnl-run-transport-equation-eoc b/src/Examples/transport-equation/tnl-run-transport-equation-eoc
similarity index 100%
rename from examples/transport-equation/tnl-run-transport-equation-eoc
rename to src/Examples/transport-equation/tnl-run-transport-equation-eoc
diff --git a/examples/transport-equation/tnl-transport-equation-eoc.cpp b/src/Examples/transport-equation/tnl-transport-equation-eoc.cpp
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation-eoc.cpp
rename to src/Examples/transport-equation/tnl-transport-equation-eoc.cpp
diff --git a/examples/transport-equation/tnl-transport-equation-eoc.cu b/src/Examples/transport-equation/tnl-transport-equation-eoc.cu
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation-eoc.cu
rename to src/Examples/transport-equation/tnl-transport-equation-eoc.cu
diff --git a/examples/transport-equation/tnl-transport-equation-eoc.h b/src/Examples/transport-equation/tnl-transport-equation-eoc.h
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation-eoc.h
rename to src/Examples/transport-equation/tnl-transport-equation-eoc.h
diff --git a/examples/transport-equation/tnl-transport-equation.cpp b/src/Examples/transport-equation/tnl-transport-equation.cpp
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation.cpp
rename to src/Examples/transport-equation/tnl-transport-equation.cpp
diff --git a/examples/transport-equation/tnl-transport-equation.cu b/src/Examples/transport-equation/tnl-transport-equation.cu
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation.cu
rename to src/Examples/transport-equation/tnl-transport-equation.cu
diff --git a/examples/transport-equation/tnl-transport-equation.h b/src/Examples/transport-equation/tnl-transport-equation.h
similarity index 100%
rename from examples/transport-equation/tnl-transport-equation.h
rename to src/Examples/transport-equation/tnl-transport-equation.h
diff --git a/examples/transport-equation/transportEquationBuildConfigTag.h b/src/Examples/transport-equation/transportEquationBuildConfigTag.h
similarity index 100%
rename from examples/transport-equation/transportEquationBuildConfigTag.h
rename to src/Examples/transport-equation/transportEquationBuildConfigTag.h
diff --git a/examples/transport-equation/transportEquationProblem.h b/src/Examples/transport-equation/transportEquationProblem.h
similarity index 100%
rename from examples/transport-equation/transportEquationProblem.h
rename to src/Examples/transport-equation/transportEquationProblem.h
diff --git a/examples/transport-equation/transportEquationProblemEoc.h b/src/Examples/transport-equation/transportEquationProblemEoc.h
similarity index 100%
rename from examples/transport-equation/transportEquationProblemEoc.h
rename to src/Examples/transport-equation/transportEquationProblemEoc.h
diff --git a/examples/transport-equation/transportEquationProblemEoc_impl.h b/src/Examples/transport-equation/transportEquationProblemEoc_impl.h
similarity index 100%
rename from examples/transport-equation/transportEquationProblemEoc_impl.h
rename to src/Examples/transport-equation/transportEquationProblemEoc_impl.h
diff --git a/examples/transport-equation/transportEquationProblem_impl.h b/src/Examples/transport-equation/transportEquationProblem_impl.h
similarity index 100%
rename from examples/transport-equation/transportEquationProblem_impl.h
rename to src/Examples/transport-equation/transportEquationProblem_impl.h
diff --git a/src/TNL/Atomic.h b/src/TNL/Atomic.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f0defe5e996d1d01d64bc5ec237a6bf1e47e96c
--- /dev/null
+++ b/src/TNL/Atomic.h
@@ -0,0 +1,348 @@
+/***************************************************************************
+                          Atomic.h  -  description
+                             -------------------
+    begin                : Sep 14, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <atomic>  // std::atomic
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/param-types.h>
+
+namespace TNL {
+
+template< typename T, typename Device >
+class Atomic
+{};
+
+template< typename T >
+class Atomic< T, Devices::Host >
+: public std::atomic< T >
+{
+public:
+   Atomic() noexcept = default;
+
+   // inherit constructors
+   using std::atomic< T >::atomic;
+
+   // NOTE: std::atomic is not copyable (see https://stackoverflow.com/a/15250851 for
+   // an explanation), but we need copyability for TNL::Containers::Array. Note that
+   // this copy-constructor and copy-assignment operator are not atomic as they
+   // synchronize only with respect to one or the other object.
+   Atomic( const Atomic& desired ) noexcept
+   {
+      this->store(desired.load());
+   }
+   Atomic& operator=( const Atomic& desired ) noexcept
+   {
+      this->store(desired.load());
+      return *this;
+   }
+
+   // just for compatibility with TNL::Containers::Array...
+   static String getType()
+   {
+      return "Atomic< " +
+             TNL::getType< T >() + ", " +
+             Devices::Host::getDeviceType() + " >";
+   }
+
+   // CAS loops for updating maximum and minimum
+   // reference: https://stackoverflow.com/a/16190791
+   T fetch_max( T value ) noexcept
+   {
+      T prev_value = *this;
+      while( prev_value < value &&
+             ! this->compare_exchange_weak( prev_value, value ) )
+         ;
+      // prev_value holds the value observed just before the update (or the
+      // current value if no update was necessary), matching fetch_* semantics
+      return prev_value;
+   }
+
+   T fetch_min( T value ) noexcept
+   {
+      T prev_value = *this;
+      while( prev_value > value &&
+             ! this->compare_exchange_weak( prev_value, value ) )
+         ;
+      return prev_value;
+   }
+};
+
+template< typename T >
+class Atomic< T, Devices::Cuda >
+{
+public:
+   using value_type = T;
+   // FIXME
+//   using difference_type = typename std::atomic< T >::difference_type;
+
+   __cuda_callable__
+   Atomic() noexcept = default;
+
+   __cuda_callable__
+   constexpr Atomic( T desired ) noexcept : value(desired) {}
+
+   __cuda_callable__
+   T operator=( T desired ) noexcept
+   {
+      store( desired );
+      return desired;
+   }
+
+   // NOTE: std::atomic is not copyable (see https://stackoverflow.com/a/15250851 for
+   // an explanation), but we need copyability for TNL::Containers::Array. Note that
+   // this copy-constructor and copy-assignment operator are not atomic as they
+   // synchronize only with respect to one or the other object.
+   __cuda_callable__
+   Atomic( const Atomic& desired ) noexcept
+   {
+      // FIXME
+//      *this = desired.load();
+      *this = desired.value;
+   }
+   __cuda_callable__
+   Atomic& operator=( const Atomic& desired ) noexcept
+   {
+      // FIXME
+//      *this = desired.load();
+      *this = desired.value;
+      return *this;
+   }
+
+   // just for compatibility with TNL::Containers::Array...
+   static String getType()
+   {
+      return "Atomic< " +
+             TNL::getType< T >() + ", " +
+             Devices::Cuda::getDeviceType() + " >";
+   }
+
+   bool is_lock_free() const noexcept
+   {
+      return true;
+   }
+
+   constexpr bool is_always_lock_free() const noexcept
+   {
+      return true;
+   }
+
+   __cuda_callable__
+   void store( T desired ) noexcept
+   {
+      // CUDA does not have a native atomic store, but it can be emulated with atomic exchange
+      exchange( desired );
+   }
+
+   __cuda_callable__
+   T load() const noexcept
+   {
+      // CUDA does not have a native atomic load:
+      // https://stackoverflow.com/questions/32341081/how-to-have-atomic-load-in-cuda
+      return const_cast<Atomic*>(this)->fetch_add( 0 );
+   }
+
+   __cuda_callable__
+   operator T() const noexcept
+   {
+      return load();
+   }
+
+   __cuda_callable__
+   T exchange( T desired ) noexcept
+   {
+#ifdef __CUDA_ARCH__
+      return atomicExch( &value, desired );
+#else
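+      // host fallback: a plain, non-atomic operation -- valid only when the
+      // object is not accessed concurrently from host threads (the same
+      // applies to the #else branches of the methods below)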
+      const T old = value;
+      value = desired;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   bool compare_exchange_weak( T& expected, T desired ) noexcept
+   {
+      return compare_exchange_strong( expected, desired );
+   }
+
+   __cuda_callable__
+   bool compare_exchange_strong( T& expected, T desired ) noexcept
+   {
+#ifdef __CUDA_ARCH__
+      const T old = atomicCAS( &value, expected, desired );
+      const bool result = old == expected;
+      expected = old;
+      return result;
+#else
+      if( value == expected ) {
+         value = desired;
+         return true;
+      }
+      else {
+         expected = value;
+         return false;
+      }
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_add( T arg )
+   {
+#ifdef __CUDA_ARCH__
+      return atomicAdd( &value, arg );
+#else
+      const T old = value;
+      value += arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_sub( T arg )
+   {
+#ifdef __CUDA_ARCH__
+      return atomicSub( &value, arg );
+#else
+      const T old = value;
+      value -= arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_and( T arg )
+   {
+#ifdef __CUDA_ARCH__
+      return atomicAnd( &value, arg );
+#else
+      const T old = value;
+      value = value & arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_or( T arg )
+   {
+#ifdef __CUDA_ARCH__
+      return atomicOr( &value, arg );
+#else
+      const T old = value;
+      value = value | arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_xor( T arg )
+   {
+#ifdef __CUDA_ARCH__
+      return atomicXor( &value, arg );
+#else
+      const T old = value;
+      value = value ^ arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T operator+=( T arg ) noexcept
+   {
+      return fetch_add( arg ) + arg;
+   }
+
+   __cuda_callable__
+   T operator-=( T arg ) noexcept
+   {
+      return fetch_sub( arg ) - arg;
+   }
+
+   __cuda_callable__
+   T operator&=( T arg ) noexcept
+   {
+      return fetch_and( arg ) & arg;
+   }
+
+   __cuda_callable__
+   T operator|=( T arg ) noexcept
+   {
+      return fetch_or( arg ) | arg;
+   }
+
+   __cuda_callable__
+   T operator^=( T arg ) noexcept
+   {
+      return fetch_xor( arg ) ^ arg;
+   }
+
+   // pre-increment
+   __cuda_callable__
+   T operator++() noexcept
+   {
+      return fetch_add(1) + 1;
+   }
+
+   // post-increment
+   __cuda_callable__
+   T operator++(int) noexcept
+   {
+      return fetch_add(1);
+   }
+
+   // pre-decrement
+   __cuda_callable__
+   T operator--() noexcept
+   {
+      return fetch_sub(1) - 1;
+   }
+
+   // post-decrement
+   __cuda_callable__
+   T operator--(int) noexcept
+   {
+      return fetch_sub(1);
+   }
+
+   // extensions (methods not present in the C++ standard)
+
+   __cuda_callable__
+   T fetch_max( T arg ) noexcept
+   {
+#ifdef __CUDA_ARCH__
+      return atomicMax( &value, arg );
+#else
+      const T old = value;
+      value = ( value > arg ) ? value : arg;
+      return old;
+#endif
+   }
+
+   __cuda_callable__
+   T fetch_min( T arg ) noexcept
+   {
+#ifdef __CUDA_ARCH__
+      return atomicMin( &value, arg );
+#else
+      const T old = value;
+      value = ( value < arg ) ? value : arg;
+      return old;
+#endif
+   }
+
+protected:
+   T value;
+};
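+
+// A minimal usage sketch (illustrative only: the managed allocation and the
+// kernel are hypothetical, and the device template parameter is inferred from
+// the getType() string above). On the device the methods map to the native
+// atomic* intrinsics; the host fallbacks are plain, non-atomic updates.
+//
+//    using AtomicInt = Atomic< int, Devices::Host >;
+//    AtomicInt* counter;
+//    cudaMallocManaged( &counter, sizeof( AtomicInt ) );
+//    counter->store( 0 );
+//    // inside a CUDA kernel: counter->fetch_add( 1 );
+//    // on the host, after cudaDeviceSynchronize():
+//    const int total = counter->load();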
+
+} // namespace TNL
diff --git a/src/TNL/CMakeLists.txt b/src/TNL/CMakeLists.txt
index 923d63b858fe28a36b0d9670a44525a630a1c519..cd07ae65910ab69bf004f69bb84231caf78aaab3 100644
--- a/src/TNL/CMakeLists.txt
+++ b/src/TNL/CMakeLists.txt
@@ -19,6 +19,7 @@ ADD_SUBDIRECTORY( legacy )
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL )
 
 set( headers
+     Atomic.h
      Assert.h
      CudaSharedMemory.h
      CudaStreamPool.h
@@ -99,31 +100,32 @@ TARGET_LINK_LIBRARIES( tnl
 
 INSTALL( TARGETS tnl DESTINATION lib )
 
-IF( BUILD_MPI )
-
-   ADD_LIBRARY( tnl-mpi_static STATIC ${tnl_SOURCES} )
-   INSTALL( TARGETS tnl-mpi_static DESTINATION lib )
-
-   if( BUILD_CUDA )
-      CUDA_ADD_LIBRARY( tnl-mpi SHARED ${tnl_CUDA__SOURCES}
-                        OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
-      # the static library with CUDA support has to be built separately
-      CUDA_ADD_LIBRARY( tnl-mpi-cuda_static STATIC ${tnl_CUDA__SOURCES} )
-      INSTALL( TARGETS tnl-mpi-cuda_static DESTINATION lib )
-   else( BUILD_CUDA )
-      ADD_LIBRARY( tnl-mpi SHARED ${tnl_SOURCES} )
-   endif( BUILD_CUDA )
-
-   SET_TARGET_PROPERTIES( tnl-mpi PROPERTIES
-                          VERSION ${tnlVersion} )
-#   SET_TARGET_PROPERTIES( tnl-mpi
-#                          LINK_INTERFACE_LIBRARIES "")
-
-
-   TARGET_LINK_LIBRARIES( tnl-mpi
-                          ${MPI_LIBRARIES} )
-   INSTALL( TARGETS tnl-mpi DESTINATION lib )
-
-endif()
+# NOTE: a separate tnl-mpi library is not necessary until something in the library sources actually depends on MPI
+#IF( BUILD_MPI )
+#
+#   ADD_LIBRARY( tnl-mpi_static STATIC ${tnl_SOURCES} )
+#   INSTALL( TARGETS tnl-mpi_static DESTINATION lib )
+#
+#   if( BUILD_CUDA )
+#      CUDA_ADD_LIBRARY( tnl-mpi SHARED ${tnl_CUDA__SOURCES}
+#                        OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
+#      # the static library with CUDA support has to be built separately
+#      CUDA_ADD_LIBRARY( tnl-mpi-cuda_static STATIC ${tnl_CUDA__SOURCES} )
+#      INSTALL( TARGETS tnl-mpi-cuda_static DESTINATION lib )
+#   else( BUILD_CUDA )
+#      ADD_LIBRARY( tnl-mpi SHARED ${tnl_SOURCES} )
+#   endif( BUILD_CUDA )
+#
+#   SET_TARGET_PROPERTIES( tnl-mpi PROPERTIES
+#                          VERSION ${tnlVersion} )
+##   SET_TARGET_PROPERTIES( tnl-mpi
+##                          LINK_INTERFACE_LIBRARIES "")
+#
+#
+#   TARGET_LINK_LIBRARIES( tnl-mpi
+#                          ${MPI_LIBRARIES} )
+#   INSTALL( TARGETS tnl-mpi DESTINATION lib )
+#
+#endif()
 
 INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )
diff --git a/src/TNL/Communicators/CMakeLists.txt b/src/TNL/Communicators/CMakeLists.txt
index 7a58eaa2a04efe42bc444b09e28b25c51fc5d7e4..fb3193b739c75ea41ddae9ecf664592e44821d79 100644
--- a/src/TNL/Communicators/CMakeLists.txt
+++ b/src/TNL/Communicators/CMakeLists.txt
@@ -1,6 +1,7 @@
 SET( headers MpiCommunicator.h
              MpiDefs.h             
              NoDistrCommunicator.h 
+             ScopedInitializer.h
     )
 
 INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Communicators )
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index c64abccb4603e1c0924ddad56f0e52201f724c95..c233004a602f31ce8b7220b9983c9541f47f6331 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -16,7 +16,10 @@
 
 #ifdef HAVE_MPI
 #include <mpi.h>
-#include <mpi-ext.h>
+#ifdef OMPI_MAJOR_VERSION
+   // header specific to Open MPI (needed for detecting CUDA-aware MPI support)
+   #include <mpi-ext.h>
+#endif
 
 #ifdef HAVE_CUDA
     #include <TNL/Devices/Cuda.h>
@@ -74,7 +77,11 @@ class MpiCommunicator
 
       static bool isDistributed()
       {
+#ifdef HAVE_MPI
          return GetSize(AllGroup)>1;
+#else
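+         // a build without MPI always runs as a single process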
+         return false;
+#endif
       }
 
       static void configSetup( Config::ConfigDescription& config, const String& prefix = "" )
@@ -142,7 +149,7 @@ class MpiCommunicator
          return true;
       }
 
-      static void Init(int argc, char **argv )
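+      // argc and argv are taken by reference because MPI_Init may consume MPI-specific command-line arguments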
+      static void Init(int& argc, char**& argv )
       {
 #ifdef HAVE_MPI
          MPI_Init( &argc, &argv );
@@ -173,6 +180,8 @@ class MpiCommunicator
                std::cout.rdbuf(psbuf);
             }
          }
+#else
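+         // calling Init in a build without MPI support is now an error instead of a silent no-op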
+         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -199,7 +208,7 @@ class MpiCommunicator
          MPI_Finalized(&finalized);
          return initialized && !finalized;
 #else
-        return false;
+         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -212,7 +221,7 @@ class MpiCommunicator
         MPI_Comm_rank(group,&rank);
         return rank;
 #else
-        return 1;
+         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -225,7 +234,7 @@ class MpiCommunicator
          MPI_Comm_size(group,&size);
          return size;
 #else
-         return 1;
+         throw Exceptions::MPISupportMissing();
 #endif
       }
 
@@ -250,6 +259,8 @@ class MpiCommunicator
             /***END OF HACK***/
 
             MPI_Dims_create(nproc, dim, distr);
+#else
+            throw Exceptions::MPISupportMissing();
 #endif
         }
 
@@ -419,7 +430,7 @@ class MpiCommunicator
             MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup);
         }
 #else
-        newGroup=oldGroup;
+         throw Exceptions::MPISupportMissing();
 #endif
       }
 
diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h
index a25b9f1bb74712390769a8d317f26317ea820613..aac58b916bf17656e9d6c33bead7a4d37441fca7 100644
--- a/src/TNL/Communicators/NoDistrCommunicator.h
+++ b/src/TNL/Communicators/NoDistrCommunicator.h
@@ -37,7 +37,7 @@ class NoDistrCommunicator
          return true;
       }
       
-      static void Init(int argc, char **argv, bool redirect=false) {}
+      static void Init(int& argc, char**& argv) {}
       
       static void setRedirection( bool redirect_ ) {}
       
diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/Communicators/ScopedInitializer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2970bc628319bdf9d4c40d7a2cb32694a8148f7d
--- /dev/null
+++ b/src/TNL/Communicators/ScopedInitializer.h
@@ -0,0 +1,33 @@
+/***************************************************************************
+                          ScopedInitializer.h  -  description
+                             -------------------
+    begin                : Sep 16, 2018
+    copyright            : (C) 2005 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+namespace TNL {
+namespace Communicators {
+
+template< typename Communicator >
+struct ScopedInitializer
+{
+   ScopedInitializer( int& argc, char**& argv )
+   {
+      Communicator::Init( argc, argv );
+   }
+
+   ~ScopedInitializer()
+   {
+      Communicator::Finalize();
+   }
+};
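+
+// Typical usage (mirroring Solvers::Solver and the MPI unit tests):
+//
+//    int main( int argc, char* argv[] )
+//    {
+//       Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv );
+//       // ... Communicator::Finalize() runs automatically when mpi goes
+//       // out of scope, including on early returns
+//    }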
+
+} // namespace Communicators
+} // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
index 1f21dc261ddaadad55bcc5b25357e9c6bb616ba6..bca6bdb0479eb38a329234f92305421873864dc5 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
@@ -201,10 +201,9 @@ containsValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
    bool result = false;
-   using Operation = Algorithms::ParallelReductionContainsValue< Element >;
-   Operation reductionContainsValue;
+   Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
    reductionContainsValue.setValue( value );
-   Reduction< Devices::Cuda >::template reduce< Operation, Index >( reductionContainsValue, size, data, 0, result );
+   Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result );
    return result;
 }
 
@@ -220,10 +219,9 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
    bool result = false;
-   using Operation = Algorithms::ParallelReductionContainsOnlyValue< Element >;
-   Operation reductionContainsOnlyValue;
+   Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
    reductionContainsOnlyValue.setValue( value );
-   Reduction< Devices::Cuda >::template reduce< Operation, Index >( reductionContainsOnlyValue, size, data, 0, result );
+   Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result );
    return result;
 }
 
diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h
index 026a8dd3c0224935601ef8bc5ae9c51d39bdea4d..3a8c74b6785b68357707ff9e7349a42a67e07f5f 100644
--- a/src/TNL/Functions/MeshFunction_impl.h
+++ b/src/TNL/Functions/MeshFunction_impl.h
@@ -40,9 +40,6 @@ MeshFunction( const MeshPointer& meshPointer )
 
    this->meshPointer=meshPointer;
    this->data.setSize( getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
-   TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), 
-               std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
-                         << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() );
 }
 
 template< typename Mesh,
@@ -69,7 +66,7 @@ MeshFunction( const MeshPointer& meshPointer,
 //: meshPointer( meshPointer )
 {
    TNL_ASSERT_GE( data.getSize(), meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
-                  "The input vector is not large enough for binding to the mesh function." );      
+                  "The input vector is not large enough for binding to the mesh function." );
     setupSynchronizer(meshPointer->getDistributedMesh());
 
    this->meshPointer=meshPointer;
@@ -88,7 +85,7 @@ MeshFunction( const MeshPointer& meshPointer,
 //: meshPointer( meshPointer )
 {
    TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
-                  "The input vector is not large enough for binding to the mesh function." );      
+                  "The input vector is not large enough for binding to the mesh function." );
 
     setupSynchronizer(meshPointer->getDistributedMesh());
 
@@ -203,10 +200,9 @@ MeshFunction< Mesh, MeshEntityDimension, Real >::
 bind( const Vector& data,
       const IndexType& offset )
 {
+   TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
+                  "The input vector is not large enough for binding to the mesh function." );
    this->data.bind( data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
-   TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), 
-               std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
-                         << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() );   
 }
 
 template< typename Mesh,
@@ -219,9 +215,9 @@ bind( const MeshPointer& meshPointer,
       const Vector& data,
       const IndexType& offset )
 {
-   TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), 
-                  "The input vector is not large enough for binding to the mesh function." );    
-   
+   TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
+                  "The input vector is not large enough for binding to the mesh function." );
+
    setupSynchronizer(meshPointer->getDistributedMesh());
    this->meshPointer=meshPointer;
    this->data.bind( data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() );
@@ -237,8 +233,8 @@ bind( const MeshPointer& meshPointer,
       const Pointers::SharedPointer<  Vector >& data,
       const IndexType& offset )
 {
-   TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), 
-                   "The input vector is not large enough for binding to the mesh function." );      
+   TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
+                  "The input vector is not large enough for binding to the mesh function." );
 
    setupSynchronizer(meshPointer->getDistributedMesh());
    this->meshPointer=meshPointer;
@@ -478,11 +474,10 @@ bool
 MeshFunction< Mesh, MeshEntityDimension, Real >::
 save( File& file ) const
 {
-   TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), 
-               std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
-                         << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() );
+   TNL_ASSERT_EQ( this->data.getSize(), this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(),
+                  "Size of the mesh function data does not match the mesh." );
    if( ! Object::save( file ) )
-      return false;              
+      return false;
    return this->data.save( file );
 }
 
@@ -494,12 +489,12 @@ MeshFunction< Mesh, MeshEntityDimension, Real >::
 load( File& file )
 {
    if( ! Object::load( file ) )
-      return false;   
+      return false;
    if( ! this->data.load( file ) )
       return false;
    const IndexType meshSize = this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >();
    if( this->data.getSize() != meshSize )
-   {      
+   {
       std::cerr << "Size of the data loaded to the mesh function (" << this->data.getSize() << ") does not fit with the mesh size (" << meshSize << ")." << std::endl;
       return false;
    }
diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index 2d81bea6678a41508ea8a1bdfa19d93b0c831c01..67e9f5e082af0b238e1d045bc516bf578efe82a0 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -168,24 +168,6 @@ T sign( const T& a )
    return ( T ) 1;
 }
 
-template< class T >
-__cuda_callable__
-bool isNan( const T& v )
-{
-#if defined HAVE_CUDA
-   #if defined(__CUDA_ARCH__)
-      return isnan( v );
-   #else
-      #if defined (__GNUC__) && ( __GNUC__  < 5 )
-         return false;
-      #else
-         return std::isnan( v );
-      #endif
-   #endif
-#else
-   return std::isnan( v );
-#endif
-}
 template< typename Real >
 __cuda_callable__
 bool isSmall( const Real& v,
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
index 2bef7d8bdffbe4f797367af3ffb49af01fb47970..dd9562add377b02c3ab9ca91fa4804762182b46c 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
@@ -22,12 +22,7 @@ template< typename GridEntity,
           int NeighborEntityDimension,
           typename GridEntityConfig,
           bool storage = GridEntityConfig::template neighborEntityStorage< GridEntity >( NeighborEntityDimension ) >
-class NeighborGridEntityLayer{};   
-   
-template< typename GridEntity,
-          int NeighborEntityDimension,
-          typename GridEntityConfig >
-class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityConfig, true >
+class NeighborGridEntityLayer
 : public NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig >
 {
    public:
@@ -63,8 +58,9 @@ class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityCo
 };
 
 template< typename GridEntity,
-          typename GridEntityConfig >
-class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, true >
+          typename GridEntityConfig,
+          bool storage >
+class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, storage >
 {
    public:
  
@@ -93,52 +89,6 @@ class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, true >
       NeighborEntityGetterType neighborEntities;
 };
 
-template< typename GridEntity,
-          int NeighborEntityDimension,
-          typename GridEntityConfig >
-class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityConfig, false >
-: public NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig >
-{
-   public:
-      
-      typedef NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig > BaseType;      
-      typedef NeighborGridEntityGetter< GridEntity, NeighborEntityDimension > NeighborEntityGetterType;
-
-      using BaseType::getNeighborEntities;
- 
-      __cuda_callable__
-      NeighborGridEntityLayer( const GridEntity& entity )
-      : BaseType( entity )
-      {}
-
-      __cuda_callable__
-      const NeighborEntityGetterType& getNeighborEntities( const DimensionTag< NeighborEntityDimension >& tag ) const {}
- 
-      __cuda_callable__
-      void refresh( const typename GridEntity::GridType& grid,
-                    const typename GridEntity::GridType::IndexType& entityIndex ) {}
-};
-
-template< typename GridEntity,
-          typename GridEntityConfig >
-class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, false >
-{
-   public:
-      
-      typedef NeighborGridEntityGetter< GridEntity, 0 > NeighborEntityGetterType;
-         
-      __cuda_callable__
-      NeighborGridEntityLayer( const GridEntity& entity ){}
-
-      __cuda_callable__
-      const NeighborEntityGetterType& getNeighborEntities( const DimensionTag< 0 >& tag ) const {}
- 
-      __cuda_callable__
-      void refresh( const typename GridEntity::GridType& grid,
-                    const typename GridEntity::GridType::IndexType& entityIndex ) {}
-};
-
-
 
 
 template< typename GridEntity,
diff --git a/src/TNL/Solvers/IterativeSolver_impl.h b/src/TNL/Solvers/IterativeSolver_impl.h
index e65b3cdd412c899b6803b2129f193c9846c05766..333c38cd44a54478d01b439dbc27780711b44ac1 100644
--- a/src/TNL/Solvers/IterativeSolver_impl.h
+++ b/src/TNL/Solvers/IterativeSolver_impl.h
@@ -107,7 +107,7 @@ bool IterativeSolver< Real, Index> :: checkNextIteration()
 {
    this->refreshSolverMonitor();
 
-   if( isNan( this->getResidue() ) ||
+   if( std::isnan( this->getResidue() ) ||
        this->getIterations() > this->getMaxIterations()  ||
        ( this->getResidue() > this->getDivergenceResidue() && this->getIterations() >= this->getMinIterations() ) ||
        ( this->getResidue() < this->getConvergenceResidue() && this->getIterations() >= this->getMinIterations() ) )
@@ -120,7 +120,7 @@ bool
 IterativeSolver< Real, Index>::
 checkConvergence()
 {
-   if( isNan( this->getResidue() ) )
+   if( std::isnan( this->getResidue() ) )
    {
       std::cerr << std::endl << "The residue is NaN." << std::endl;
       return false;
diff --git a/src/TNL/Solvers/Linear/BICGStabL_impl.h b/src/TNL/Solvers/Linear/BICGStabL_impl.h
index 467f1a91a04c018377863b27ac85985759eb988d..6606bddd561fa8a8d5b3ce5e8855b6e41541b546 100644
--- a/src/TNL/Solvers/Linear/BICGStabL_impl.h
+++ b/src/TNL/Solvers/Linear/BICGStabL_impl.h
@@ -87,7 +87,7 @@ BICGStabL< Matrix >::solve( ConstVectorViewType b, VectorViewType x )
    }
 
    sigma[ 0 ] = r_0.lpNorm( 2.0 );
-   if( isNan( sigma[ 0 ] ) )
+   if( std::isnan( sigma[ 0 ] ) )
       throw std::runtime_error( "BiCGstab(ell): initial residue is NAN" );
 
    r_ast = r_0;
diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h
index 63fe4591f1de860e38966e9185eb9bd5ca6dd228..c6bc5ca7f494abd8922f1a0fcb45b4814277094f 100644
--- a/src/TNL/Solvers/SolverInitiator_impl.h
+++ b/src/TNL/Solvers/SolverInitiator_impl.h
@@ -16,10 +16,6 @@
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Meshes/TypeResolver/TypeResolver.h>
 #include <TNL/Solvers/BuildConfigTags.h>
-#include <TNL/Solvers/Linear/SOR.h>
-#include <TNL/Solvers/Linear/CG.h>
-#include <TNL/Solvers/Linear/BICGStab.h>
-#include <TNL/Solvers/Linear/GMRES.h>
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Meshes/DummyMesh.h>
 
@@ -190,15 +186,9 @@ class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, t
    public:
       static bool run( const Config::ParameterContainer& parameters )
       {
-         if(Communicators::MpiCommunicator::isDistributed())
-         {     
-               bool ret=SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters );
-               Communicators::MpiCommunicator::Finalize();      
-               return ret;
-         }
-         Communicators::MpiCommunicator::Finalize();
+         if( Communicators::MpiCommunicator::isDistributed() )
+            return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters );
          return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters );
-         
       }
 };
 
diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h
index a0e4f1953208edb1fc89d820571aa329e376ff5c..ef865e8b7b25b6ce426c1b0d3c2918187058918d 100644
--- a/src/TNL/Solvers/Solver_impl.h
+++ b/src/TNL/Solvers/Solver_impl.h
@@ -14,8 +14,8 @@
 #include <TNL/Solvers/SolverStarter.h>
 #include <TNL/Solvers/SolverConfig.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 
 namespace TNL {
 namespace Solvers {
@@ -34,19 +34,15 @@ run( int argc, char* argv[] )
    configDescription.addDelimiter( "Parallelization setup:" );
    Devices::Host::configSetup( configDescription );
    Devices::Cuda::configSetup( configDescription );
-   Communicators::NoDistrCommunicator::configSetup( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
-   
-   Communicators::NoDistrCommunicator::Init(argc,argv);
-   Communicators::MpiCommunicator::Init(argc,argv);
+
+   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv );
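+   // MpiCommunicator::Finalize() is called automatically when mpi goes out of scope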
 
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return false;
 
    SolverInitiator< ProblemSetter, MeshConfig > solverInitiator;
-   bool ret= solverInitiator.run( parameters );
-
-	return ret;
+   return solverInitiator.run( parameters );
 };
 
 } // namespace Solvers
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index ac72f7a13283958fd407c4e281c1baeb4a038996..7dd7032810794f5f2ee43023472be6773fcb2de6 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -17,8 +17,8 @@
 #include <TNL/Meshes/DummyMesh.h>
 #include <TNL/Meshes/Grid.h>
 
-#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 
 
 using namespace TNL;
@@ -51,18 +51,14 @@ void setupConfig( Config::ConfigDescription& config )
 
 int main( int argc, char* argv[] )
 {
-
    Config::ParameterContainer parameters;
    Config::ConfigDescription configDescription;
 
    setupConfig( configDescription );
-   
-   Communicators::NoDistrCommunicator::configSetup( configDescription );
    Communicators::MpiCommunicator::configSetup( configDescription );
-   
-   Communicators::NoDistrCommunicator::Init(argc,argv);
-   Communicators::MpiCommunicator::Init(argc,argv);   
- 
+
+   Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv);
+
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return EXIT_FAILURE;
 
@@ -83,9 +79,5 @@ int main( int argc, char* argv[] )
    if( ! resolveMeshType( parsedMeshType, parameters ) )
       return EXIT_FAILURE;
 
-#ifdef HAVE_MPI
-   Communicators::MpiCommunicator::Finalize();
-#endif
-      
    return EXIT_SUCCESS;
 }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
index 71cca245d18c157a98d2533c42f32b80ff16190a..ad4127dbd724d0fba9cc03bc21e065e3ae91c179 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
+++ b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt
@@ -20,7 +20,7 @@ ADD_TEST( NAME DirectionsTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/DirectionsTest${
 ADD_TEST( NAME CopyEntitesTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/CopyEntitesTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( NAME CutMeshFunctionTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/CutMeshFunctionTest${CMAKE_EXECUTABLE_SUFFIX} )
 
-if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" )
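+# the MPI tests are enabled by the BUILD_MPI option instead of checking the compiler wrapper name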
+if( BUILD_MPI )
 ADD_EXECUTABLE( DistributedGridTest_1D DistributedGridTest_1D.cpp )
    TARGET_COMPILE_OPTIONS( DistributedGridTest_1D PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( DistributedGridTest_1D
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
index bf7755e5fc3c6b1f1ced0b96bdbc0cfff7a47165..587ec807ec0ad01515832f777cd444c5c9ac386a 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp
@@ -4,6 +4,7 @@
 #ifdef HAVE_MPI  
 
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
@@ -404,7 +405,7 @@ TEST(NoMPI, NoTest)
   };
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
@@ -417,14 +418,9 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv);
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
index d78a1dff652a605806f98bea3b392f00f156d0e9..4907f1d269cfc3d7f5d205405546675f1b8d58fe 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
@@ -6,6 +6,7 @@
 #include <TNL/Devices/Host.h> 
 #include <TNL/Functions/CutMeshFunction.h>
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
@@ -684,7 +685,7 @@ TEST(NoMPI, NoTest)
   };
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
@@ -697,14 +698,9 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv);
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
index 5850c956e9f194806e8bca503f15763584516d7d..ce78b85680c69bba791d205661d918d1498ef3a0 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp
@@ -213,15 +213,13 @@ TEST(CutMeshFunction, 3D_2)
 
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
    ::testing::InitGoogleTest( &argc, argv );
-       int result= RUN_ALL_TESTS();
-       return result;
+   return RUN_ALL_TESTS();
 #else
-   
    throw GtestMissingError();
 #endif
 }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
index 52ed831d0baced3dbac35f36de9ac2719dd8363f..e8625bc3d6ab57d3ae01b23d99e68dc34792d7a2 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
@@ -121,7 +121,7 @@ TEST(XYZ, 3D )
 
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h
index d7774436c5da27645b62fd5f81f5954a154f9c28..537bcd9239ddde38785b4c462d4bf9d71c0537c7 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h
@@ -12,6 +12,7 @@
 #ifdef HAVE_MPI
 
 #include "DistributedGridIOTest.h"
+#include <TNL/Communicators/ScopedInitializer.h>
 
 TEST( DistributedGridIO, Save_1D )
 {
@@ -134,16 +135,11 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv );
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
        CommunicatorType::setRedirection( false );
        CommunicatorType::setupRedirection();
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h
index aaf5073ec86aa4961d6a40b7528c7b4ac0b73772..4e3603a7a40c36cc04d247f52148f029868dcbdf 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h
@@ -10,6 +10,7 @@
 #ifdef HAVE_MPI
 
 #include "DistributedGridIO_MPIIOTest.h"
+#include <TNL/Communicators/ScopedInitializer.h>
 
 TEST( DistributedGridMPIIO, Save_1D )
 {
@@ -131,16 +132,11 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv );
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
        CommunicatorType::setRedirection( false );
        CommunicatorType::setupRedirection();
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
index e907ecc9dcc5736dda015b807c090b23b1d6f1df..251b9f553a4fc82b7d1bb5ef768991a519aec047 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp
@@ -13,6 +13,7 @@
 #ifdef HAVE_MPI    
 
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -422,7 +423,7 @@ TEST(NoMPI, NoTest)
   };
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
@@ -435,14 +436,9 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv);
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
index 36426059363699661f637f3185382b85dda782a3..38276dd5436d83200b75f69a5b6b5221f23e1126 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp
@@ -15,6 +15,7 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
 
 #include "../../Functions/Functions.h"
@@ -1049,7 +1050,7 @@ TEST(NoMPI, NoTest)
   };
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
@@ -1062,14 +1063,9 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv);
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
index 1eecf6306c94bf570acd608e49e527da6036b780..6bbd7ad257a176f8f10f2dcefcd5315b385307ce 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp
@@ -4,6 +4,7 @@
 #ifdef HAVE_MPI    
 
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -765,7 +766,7 @@ TEST(NoMPI, NoTest)
   };
 #endif
 
-#include "../../src/UnitTests/GtestMissingError.h"
+#include "../../GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_GTEST
@@ -778,14 +779,9 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv);
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
index d3a1cd55267ad569ecfcf2196a8922a0384f5e36..67098fc5db6c801410425378ab5b609778b376f3 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp
@@ -6,6 +6,7 @@
 #ifdef HAVE_MPI
 
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include "DistributedVectorFieldIO_MPIIOTestBase.h"
 
 using namespace TNL::Communicators;
@@ -102,16 +103,11 @@ int main( int argc, char* argv[] )
        delete listeners.Release(listeners.default_result_printer());
        listeners.Append(new MinimalistBufferedPrinter);
 
-       CommunicatorType::Init(argc,argv );
+       Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
        CommunicatorType::setRedirection( false );
        CommunicatorType::setupRedirection();
     #endif
-       int result= RUN_ALL_TESTS();
-
-    #ifdef HAVE_MPI
-       CommunicatorType::Finalize();
-    #endif
-       return result;
+       return RUN_ALL_TESTS();
 #else
    
    throw GtestMissingError();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 33873402d8074b5fa304e84546e1188b57d9943c..93c15ce593a9db00fc240c6263706f9c10a46ac5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,9 +1,4 @@
-set( ENABLE_CODECOVERAGE )
-
 ADD_SUBDIRECTORY( data )
-ADD_SUBDIRECTORY( benchmarks )
 #ADD_SUBDIRECTORY( unit-tests )
 ADD_SUBDIRECTORY( long-time-unit-tests )
 ADD_SUBDIRECTORY( mpi )
-
-unset( ENABLE_CODECOVERAGE )
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
deleted file mode 100644
index e53ba6878070e6b8f8d3b830ab8f64afcccbcf77..0000000000000000000000000000000000000000
--- a/tests/benchmarks/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-ADD_SUBDIRECTORY( share )
-ADD_SUBDIRECTORY( heat-equation-benchmark )
-
-IF( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu )
-    CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
-
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} )
-
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
-ELSE()
-    ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl )
-
-    ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl )
-
-    ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl )
-ENDIF()
-
-INSTALL( TARGETS
-            tnl-benchmark-blas
-            tnl-benchmark-spmv
-            tnl-benchmark-linear-solvers
-         RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/array-operations.h b/tests/benchmarks/array-operations.h
deleted file mode 100644
index 504dcc1da03a91fa87af913008f9355579e62930..0000000000000000000000000000000000000000
--- a/tests/benchmarks/array-operations.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/***************************************************************************
-                          array-operations.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/Array.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkArrayOperations( Benchmark & benchmark,
-                          const int & loops,
-                          const long & size )
-{
-    typedef Containers::Array< Real, Devices::Host, Index > HostArray;
-    typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
-    using namespace std;
-
-    double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-
-    HostArray hostArray, hostArray2;
-    CudaArray deviceArray, deviceArray2;
-    hostArray.setSize( size );
-    hostArray2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceArray.setSize( size );
-    deviceArray2.setSize( size );
-#endif
-
-    Real resultHost, resultDevice;
-
-
-    // reset functions
-    auto reset1 = [&]() {
-        hostArray.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceArray.setValue( 1.0 );
-#endif
-    };
-    auto reset2 = [&]() {
-        hostArray2.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceArray2.setValue( 1.0 );
-#endif
-    };
-    auto reset12 = [&]() {
-        reset1();
-        reset2();
-    };
-
-
-    reset12();
-
-
-    auto compareHost = [&]() {
-        resultHost = (int) hostArray == hostArray2;
-    };
-    auto compareCuda = [&]() {
-        resultDevice = (int) deviceArray == deviceArray2;
-    };
-    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", compareHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", compareCuda );
-#endif
-
-
-    auto copyAssignHostHost = [&]() {
-        hostArray = hostArray2;
-    };
-    auto copyAssignCudaCuda = [&]() {
-        deviceArray = deviceArray2;
-    };
-    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
-    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
-    // complain when compiling without CUDA
-    const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", copyAssignCudaCuda );
-#endif
-
-
-    auto copyAssignHostCuda = [&]() {
-        deviceArray = hostArray;
-    };
-    auto copyAssignCudaHost = [&]() {
-        hostArray = deviceArray;
-    };
-#ifdef HAVE_CUDA
-    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
-    benchmark.time( reset1,
-                    "CPU->GPU", copyAssignHostCuda,
-                    "GPU->CPU", copyAssignCudaHost );
-#endif
-
-
-    auto setValueHost = [&]() {
-        hostArray.setValue( 3.0 );
-    };
-    auto setValueCuda = [&]() {
-        deviceArray.setValue( 3.0 );
-    };
-    benchmark.setOperation( "setValue", datasetSize );
-    benchmark.time( reset1, "CPU", setValueHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", setValueCuda );
-#endif
-
-
-    auto setSizeHost = [&]() {
-        hostArray.setSize( size );
-    };
-    auto setSizeCuda = [&]() {
-        deviceArray.setSize( size );
-    };
-    auto resetSize1 = [&]() {
-        hostArray.reset();
-#ifdef HAVE_CUDA
-        deviceArray.reset();
-#endif
-    };
-    benchmark.setOperation( "allocation (setSize)", datasetSize );
-    benchmark.time( resetSize1, "CPU", setSizeHost );
-#ifdef HAVE_CUDA
-    benchmark.time( resetSize1, "GPU", setSizeCuda );
-#endif
-
-
-    auto resetSizeHost = [&]() {
-        hostArray.reset();
-    };
-    auto resetSizeCuda = [&]() {
-        deviceArray.reset();
-    };
-    auto setSize1 = [&]() {
-        hostArray.setSize( size );
-#ifdef HAVE_CUDA
-        deviceArray.setSize( size );
-#endif
-    };
-    benchmark.setOperation( "deallocation (reset)", datasetSize );
-    benchmark.time( setSize1, "CPU", resetSizeHost );
-#ifdef HAVE_CUDA
-    benchmark.time( setSize1, "GPU", resetSizeCuda );
-#endif
-
-    return true;
-}
-
-} // namespace benchmarks
-} // namespace tnl
diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h
deleted file mode 100644
index ce5e631a6899170cfeba58911be71e5cc17eb7e6..0000000000000000000000000000000000000000
--- a/tests/benchmarks/benchmarks.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/***************************************************************************
-                          benchmarks.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <iostream>
-#include <iomanip>
-#include <map>
-#include <vector>
-
-#include <TNL/Timer.h>
-#include <TNL/String.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-const double oneGB = 1024.0 * 1024.0 * 1024.0;
-
-template< typename ComputeFunction,
-          typename ResetFunction >
-double
-timeFunction( ComputeFunction compute,
-              ResetFunction reset,
-              const int & loops )
-{
-    // the timer is constructed zero-initialized and stopped
-    Timer timer;
-
-    reset();
-    for(int i = 0; i < loops; ++i) {
-        // Explicit synchronization of the CUDA device
-        // TODO: not necessary for host computations
-#ifdef HAVE_CUDA
-        cudaDeviceSynchronize();
-#endif
-        timer.start();
-        compute();
-#ifdef HAVE_CUDA
-        cudaDeviceSynchronize();
-#endif
-        timer.stop();
-
-        reset();
-    }
-
-    return timer.getRealTime();
-}
-
-
-struct InternalError {};
-
-
-class Logging
-{
-public:
-    using MetadataElement = std::pair< const char*, String >;
-    using MetadataMap = std::map< const char*, String >;
-    using MetadataColumns = std::vector<MetadataElement>;
-
-    using HeaderElements = std::initializer_list< String >;
-    using RowElements = std::initializer_list< double >;
-
-    Logging( bool verbose = true )
-        : verbose(verbose)
-    { }
-
-    void
-    writeTitle( const String & title )
-    {
-        if( verbose )
-            std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-        log << ": title = " << title << std::endl;
-    }
-
-    void
-    writeMetadata( const MetadataMap & metadata )
-    {
-        if( verbose )
-            std::cout << "properties:" << std::endl;
-
-        for( auto & it : metadata ) {
-            if( verbose )
-                std::cout << "   " << it.first << " = " << it.second << std::endl;
-            log << ": " << it.first << " = " << it.second << std::endl;
-        }
-        if( verbose )
-            std::cout << std::endl;
-    }
-
-    void
-    writeTableHeader( const String & spanningElement,
-                      const HeaderElements & subElements )
-    {
-        using namespace std;
-
-        if( verbose && header_changed ) {
-            for( auto & it : metadataColumns ) {
-               std::cout << std::setw( 20 ) << it.first;
-            }
-
-            // spanning element is printed as usual column to stdout,
-            // but is excluded from header
-           std::cout << std::setw( 15 ) << "";
-
-            for( auto & it : subElements ) {
-               std::cout << std::setw( 15 ) << it;
-            }
-           std::cout << std::endl;
-
-            header_changed = false;
-        }
-
-        // initial indent string
-        header_indent = "!";
-        log << std::endl;
-        for( auto & it : metadataColumns ) {
-            log << header_indent << " " << it.first << std::endl;
-        }
-
-        // dump stacked spanning columns
-        if( horizontalGroups.size() > 0 )
-            while( horizontalGroups.back().second <= 0 ) {
-                horizontalGroups.pop_back();
-                header_indent.pop_back();
-            }
-        for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-            if( horizontalGroups[ i ].second > 0 ) {
-                log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-                header_indent += "!";
-            }
-        }
-
-        log << header_indent << " " << spanningElement << std::endl;
-        for( auto & it : subElements ) {
-            log << header_indent << "! " << it << std::endl;
-        }
-
-        if( horizontalGroups.size() > 0 ) {
-            horizontalGroups.back().second--;
-            header_indent.pop_back();
-        }
-    }
-
-    void
-    writeTableRow( const String & spanningElement,
-                   const RowElements & subElements )
-    {
-        using namespace std;
-
-        if( verbose ) {
-            for( auto & it : metadataColumns ) {
-               std::cout << std::setw( 20 ) << it.second;
-            }
-            // spanning element is printed as usual column to stdout
-           std::cout << std::setw( 15 ) << spanningElement;
-            for( auto & it : subElements ) {
-               std::cout << std::setw( 15 );
-                if( it != 0.0 )std::cout << it;
-                else std::cout << "N/A";
-            }
-           std::cout << std::endl;
-        }
-
-        // only when changed (the header has been already adjusted)
-        // print each element on separate line
-        for( auto & it : metadataColumns ) {
-            log << it.second << std::endl;
-        }
-
-        // benchmark data are indented
-        const String indent = "    ";
-        for( auto & it : subElements ) {
-            if( it != 0.0 ) log << indent << it << std::endl;
-            else log << indent << "N/A" << std::endl;
-        }
-    }
-
-    void
-    writeErrorMessage( const char* msg,
-                       const int & colspan = 1 )
-    {
-        // initial indent string
-        header_indent = "!";
-        log << std::endl;
-        for( auto & it : metadataColumns ) {
-            log << header_indent << " " << it.first << std::endl;
-        }
-
-        // make sure there is a header column for the message
-        if( horizontalGroups.size() == 0 )
-            horizontalGroups.push_back( {"", 1} );
-
-        // dump stacked spanning columns
-        while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-        }
-        for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-            if( horizontalGroups[ i ].second > 0 ) {
-                log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-                header_indent += "!";
-            }
-        }
-        if( horizontalGroups.size() > 0 ) {
-            horizontalGroups.back().second -= colspan;
-            header_indent.pop_back();
-        }
-
-        // only when changed (the header has been already adjusted)
-        // print each element on separate line
-        for( auto & it : metadataColumns ) {
-            log << it.second << std::endl;
-        }
-        log << msg << std::endl;
-    }
-
-    void
-    closeTable()
-    {
-        log << std::endl;
-        header_indent = body_indent = "";
-        header_changed = true;
-        horizontalGroups.clear();
-    }
-
-    bool save( std::ostream & logFile )
-    {
-        closeTable();
-        logFile << log.str();
-        if( logFile.good() ) {
-            log.str() = "";
-            return true;
-        }
-        return false;
-    }
-
-protected:
-
-    // manual double -> String conversion with fixed precision
-    static String
-    _to_string( const double & num, const int & precision = 0, bool fixed = false )
-    {
-        std::stringstream str;
-        if( fixed )
-            str << std::fixed;
-        if( precision )
-            str << std::setprecision( precision );
-        str << num;
-        return String( str.str().data() );
-    }
-
-    std::stringstream log;
-    std::string header_indent;
-    std::string body_indent;
-
-    bool verbose;
-    MetadataColumns metadataColumns;
-    bool header_changed = true;
-    std::vector< std::pair< String, int > > horizontalGroups;
-};
-
-
-class Benchmark
-    : protected Logging
-{
-public:
-    using Logging::MetadataElement;
-    using Logging::MetadataMap;
-    using Logging::MetadataColumns;
-
-    Benchmark( const int & loops = 10,
-               bool verbose = true )
-        : Logging(verbose), loops(loops)
-    { }
-
-    // TODO: ensure that this is not called in the middle of the benchmark
-    // (or just remove it completely?)
-    void
-    setLoops( const int & loops )
-    {
-        this->loops = loops;
-    }
-
-    // Marks the start of a new benchmark
-    void
-    newBenchmark( const String & title )
-    {
-        closeTable();
-        writeTitle( title );
-    }
-
-    // Marks the start of a new benchmark (with custom metadata)
-    void
-    newBenchmark( const String & title,
-                  MetadataMap metadata )
-    {
-        closeTable();
-        writeTitle( title );
-        // add loops to metadata
-        metadata["loops"] = String(loops);
-        writeMetadata( metadata );
-    }
-
-    // Sets metadata columns -- values used for all subsequent rows until
-    // the next call to this function.
-    void
-    setMetadataColumns( const MetadataColumns & metadata )
-    {
-        if( metadataColumns != metadata )
-            header_changed = true;
-        metadataColumns = metadata;
-    }
-
-    // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
-    // Sets current operation -- operations expand the table vertically
-    //  - baseTime should be reset to 0.0 for most operations, but sometimes
-    //    it is useful to override it
-    //  - Order of operations inside a "Benchmark" does not matter, rows can be
-    //    easily sorted while converting to HTML.)
-    void
-    setOperation( const String & operation,
-                  const double & datasetSize = 0.0, // in GB
-                  const double & baseTime = 0.0 )
-    {
-        if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
-            metadataColumns[ 0 ].second = operation;
-        }
-        else {
-            metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
-        }
-        setOperation( datasetSize, baseTime );
-        header_changed = true;
-    }
-
-    void
-    setOperation( const double & datasetSize = 0.0,
-                  const double & baseTime = 0.0 )
-    {
-        this->datasetSize = datasetSize;
-        this->baseTime = baseTime;
-    }
-
-    // Creates a new horizontal group inside a benchmark -- increases the number
-    // of columns in the "Benchmark" and implies column spanning.
-    // (Useful e.g. for SpMV formats, different configurations etc.)
-    void
-    createHorizontalGroup( const String & name,
-                           const int & subcolumns )
-    {
-        if( horizontalGroups.size() == 0 ) {
-            horizontalGroups.push_back( {name, subcolumns} );
-        }
-        else {
-            auto & last = horizontalGroups.back();
-            if( last.first != name && last.second > 0 ) {
-                horizontalGroups.push_back( {name, subcolumns} );
-            }
-            else {
-                last.first = name;
-                last.second = subcolumns;
-            }
-        }
-    }
-
-    // Times a single ComputeFunction. Subsequent calls implicitly split
-    // the current "horizontal group" into sub-columns identified by
-    // "performer", which are further split into "bandwidth", "time" and
-    // "speedup" columns.
-    // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
-    // Also terminates the recursion of the following variadic template.
-    template< typename ResetFunction,
-              typename ComputeFunction >
-    double
-    time( ResetFunction reset,
-          const String & performer,
-          ComputeFunction & compute )
-    {
-        const double time = timeFunction( compute, reset, loops );
-        const double bandwidth = datasetSize / time;
-        const double speedup = this->baseTime / time;
-        if( this->baseTime == 0.0 )
-            this->baseTime = time;
-
-        writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) );
-        writeTableRow( performer, RowElements({ bandwidth, time, speedup }) );
-
-        return this->baseTime;
-    }
-
-    // Recursive template function to deal with multiple computations with the
-    // same reset function.
-    template< typename ResetFunction,
-              typename ComputeFunction,
-              typename... NextComputations >
-    inline double
-    time( ResetFunction reset,
-          const String & performer,
-          ComputeFunction & compute,
-          NextComputations & ... nextComputations )
-    {
-        time( reset, performer, compute );
-        time( reset, nextComputations... );
-        return this->baseTime;
-    }
-
-    // Adds an error message to the log. Should be called in places where the
-    // "time" method could not be called (e.g. due to failed allocation).
-    void
-    addErrorMessage( const char* msg,
-                     const int & numberOfComputations = 1 )
-    {
-        // each computation has 3 subcolumns
-        const int colspan = 3 * numberOfComputations;
-        writeErrorMessage( msg, colspan );
-    }
-
-    using Logging::save;
-
-protected:
-    int loops;
-    double datasetSize = 0.0;
-    double baseTime = 0.0;
-};
-
-} // namespace benchmarks
-} // namespace TNL
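
A minimal usage sketch of the `Benchmark` interface removed above. The API (`newBenchmark`, `setOperation`, `time`, `save`) is taken from the deleted header; the computation, sizes and file name are illustrative, and the dataset size follows the convention of the deleted benchmarks (it includes the `loops` factor). The variadic `time` overload packs further `(performer, compute)` pairs after the first, as shown in the comment.

```cpp
#include <fstream>
#include "benchmarks.h"   // the header deleted above

using namespace TNL::benchmarks;

int main()
{
    Benchmark benchmark( /* loops = */ 10, /* verbose = */ true );
    benchmark.newBenchmark( "Example benchmark" );

    double a[ 1024 ], b[ 1024 ];
    auto reset = [&]() {
        for( int i = 0; i < 1024; i++ ) { a[ i ] = 1.0; b[ i ] = 0.0; }
    };
    auto copy = [&]() {
        for( int i = 0; i < 1024; i++ ) b[ i ] = a[ i ];
    };

    // dataset size in GB: one read and one write of 1024 doubles, times 10 loops
    const double datasetSize = 10.0 * 2 * 1024 * sizeof( double ) / 1e9;
    benchmark.setOperation( "copy", datasetSize );
    benchmark.time( reset, "CPU", copy );
    // equivalent variadic form for several computations sharing one reset:
    // benchmark.time( reset, "CPU", copyHost, "GPU", copyCuda );

    std::ofstream logFile( "example.log" );
    return benchmark.save( logFile ) ? 0 : 1;
}
```
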
diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h
deleted file mode 100644
index cc957c7278ec9b21a418e627af9063e32bec3785..0000000000000000000000000000000000000000
--- a/tests/benchmarks/spmv.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/***************************************************************************
-                          spmv.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/List.h>
-#include <TNL/Pointers/DevicePointer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-// silly alias to match the number of template parameters with other formats
-template< typename Real, typename Device, typename Index >
-using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
-
-template< typename Matrix >
-int setHostTestMatrix( Matrix& matrix,
-                       const int elementsPerRow )
-{
-    const int size = matrix.getRows();
-    int elements( 0 );
-    for( int row = 0; row < size; row++ ) {
-        int col = row - elementsPerRow / 2;
-        for( int element = 0; element < elementsPerRow; element++ ) {
-            if( col + element >= 0 &&
-                col + element < size )
-            {
-                matrix.setElement( row, col + element, element + 1 );
-                elements++;
-            }
-        }
-    }
-    return elements;
-}
-
-#ifdef HAVE_CUDA
-template< typename Matrix >
-__global__ void setCudaTestMatrixKernel( Matrix* matrix,
-                                         const int elementsPerRow,
-                                         const int gridIdx )
-{
-    const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    if( rowIdx >= matrix->getRows() )
-        return;
-    int col = rowIdx - elementsPerRow / 2;
-    for( int element = 0; element < elementsPerRow; element++ ) {
-        if( col + element >= 0 &&
-            col + element < matrix->getColumns() )
-           matrix->setElementFast( rowIdx, col + element, element + 1 );
-    }
-}
-#endif
-
-template< typename Matrix >
-void setCudaTestMatrix( Matrix& matrix,
-                        const int elementsPerRow )
-{
-#ifdef HAVE_CUDA
-    typedef typename Matrix::IndexType IndexType;
-    typedef typename Matrix::RealType RealType;
-    Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
-    dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
-    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-    const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
-    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
-        if( gridIdx == cudaGrids - 1 )
-            cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
-        setCudaTestMatrixKernel< Matrix >
-            <<< cudaGridSize, cudaBlockSize >>>
-            ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
-        TNL_CHECK_CUDA_DEVICE;
-    }
-#endif
-}
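
The launch loop in `setCudaTestMatrix` splits the rows into thread blocks of 256 and, because CUDA limits the grid dimension, into several consecutive grids of at most `getMaxGridSize()` blocks, shrinking the last grid to the remainder. A plain-integer sketch of that decomposition (the values are illustrative, not TNL's actual limits):

```cpp
#include <cstdio>

// Plain-integer sketch of the grid decomposition in setCudaTestMatrix above;
// rows and maxGridSize are illustrative values.
int main()
{
    const int rows = 1000000, blockSize = 256, maxGridSize = 65535;
    const int blocks = ( rows + blockSize - 1 ) / blockSize;        // roundUpDivision
    const int grids  = ( blocks + maxGridSize - 1 ) / maxGridSize;  // roundUpDivision
    for( int gridIdx = 0; gridIdx < grids; gridIdx++ ) {
        int gridSize = maxGridSize;
        if( gridIdx == grids - 1 )
            gridSize = blocks % maxGridSize;  // caveat: yields 0 when blocks divides evenly
        std::printf( "grid %d: %d blocks of %d threads\n", gridIdx, gridSize, blockSize );
    }
    return 0;
}
```
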
-
-
-// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h
-template< typename Real,
-          template< typename, typename, typename > class Matrix,
-          template< typename, typename, typename > class Vector = Containers::Vector >
-bool
-benchmarkSpMV( Benchmark & benchmark,
-               const int & loops,
-               const int & size,
-               const int elementsPerRow = 5 )
-{
-    typedef Matrix< Real, Devices::Host, int > HostMatrix;
-    typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
-    typedef Containers::Vector< Real, Devices::Host, int > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
-
-    HostMatrix hostMatrix;
-    DeviceMatrix deviceMatrix;
-    Containers::Vector< int, Devices::Host, int > hostRowLengths;
-    Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
-    HostVector hostVector, hostVector2;
-    CudaVector deviceVector, deviceVector2;
-
-    // create benchmark group
-    Containers::List< String > parsedType;
-    parseObjectType( HostMatrix::getType(), parsedType );
-    benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
-
-    hostRowLengths.setSize( size );
-    hostMatrix.setDimensions( size, size );
-    hostVector.setSize( size );
-    hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceRowLengths.setSize( size );
-    deviceMatrix.setDimensions( size, size );
-    deviceVector.setSize( size );
-    deviceVector2.setSize( size );
-#endif
-
-    hostRowLengths.setValue( elementsPerRow );
-#ifdef HAVE_CUDA
-    deviceRowLengths.setValue( elementsPerRow );
-#endif
-
-    hostMatrix.setCompressedRowLengths( hostRowLengths );
-#ifdef HAVE_CUDA
-    deviceMatrix.setCompressedRowLengths( deviceRowLengths );
-#endif
-
-    const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
-    setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
-    const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
-
-    // reset function
-    auto reset = [&]() {
-        hostVector.setValue( 1.0 );
-        hostVector2.setValue( 0.0 );
-#ifdef HAVE_CUDA
-        deviceVector.setValue( 1.0 );
-        deviceVector2.setValue( 0.0 );
-#endif
-    };
-
-    // compute functions
-    auto spmvHost = [&]() {
-        hostMatrix.vectorProduct( hostVector, hostVector2 );
-    };
-    auto spmvCuda = [&]() {
-        deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-    };
-
-    benchmark.setOperation( datasetSize );
-    benchmark.time( reset, "CPU", spmvHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset, "GPU", spmvCuda );
-#endif
-
-    return true;
-}
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkSpmvSynthetic( Benchmark & benchmark,
-                        const int & loops,
-                        const int & size,
-                        const int & elementsPerRow )
-{
-    bool result = true;
-    // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
-    result &= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow );
-    result &= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow );
-    result &= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow );
-    result &= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow );
-    return result;
-}
-
-} // namespace benchmarks
-} // namespace TNL
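
For reference, `setHostTestMatrix` and `setCudaTestMatrix` above fill a band of `elementsPerRow` consecutive values `1..elementsPerRow`, centered on the diagonal and clipped at the matrix boundary. The same index logic on a plain array, as a standalone sketch:

```cpp
#include <cstdio>

// Standalone sketch of the band pattern generated by setHostTestMatrix above.
int main()
{
    const int size = 5, elementsPerRow = 3;
    int matrix[ 5 ][ 5 ] = {};
    for( int row = 0; row < size; row++ ) {
        const int col = row - elementsPerRow / 2;
        for( int element = 0; element < elementsPerRow; element++ )
            if( col + element >= 0 && col + element < size )
                matrix[ row ][ col + element ] = element + 1;
    }
    // prints the 5x5 band:
    // 2 3 0 0 0
    // 1 2 3 0 0
    // 0 1 2 3 0
    // 0 0 1 2 3
    // 0 0 0 1 2
    for( int row = 0; row < size; row++ ) {
        for( int col = 0; col < size; col++ )
            std::printf( "%d ", matrix[ row ][ col ] );
        std::printf( "\n" );
    }
    return 0;
}
```
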
diff --git a/tests/benchmarks/tnl-benchmark-blas.h b/tests/benchmarks/tnl-benchmark-blas.h
deleted file mode 100644
index 8c71209534894d10a430bae874035c9d14d7bbc3..0000000000000000000000000000000000000000
--- a/tests/benchmarks/tnl-benchmark-blas.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/***************************************************************************
-                          tnl-benchmark-blas.h  -  description
-                             -------------------
-    begin                : Jan 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
-#include <TNL/Devices/SystemInfo.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-
-#include "array-operations.h"
-#include "vector-operations.h"
-#include "spmv.h"
-
-using namespace TNL;
-using namespace TNL::benchmarks;
-
-
-// TODO: should benchmarks check the result of the computation?
-
-
-template< typename Real >
-void
-runBlasBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
-                   const std::size_t & minSize,
-                   const std::size_t & maxSize,
-                   const double & sizeStepFactor,
-                   const unsigned & loops,
-                   const unsigned & elementsPerRow )
-{
-    const String precision = getType< Real >();
-    metadata["precision"] = precision;
-
-    // Array operations
-    benchmark.newBenchmark( String("Array operations (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-           {"size", size},
-        } ));
-        benchmarkArrayOperations< Real >( benchmark, loops, size );
-    }
-
-    // Vector operations
-    benchmark.newBenchmark( String("Vector operations (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-           {"size", size},
-        } ));
-        benchmarkVectorOperations< Real >( benchmark, loops, size );
-    }
-
-    // Sparse matrix-vector multiplication
-    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
-                            metadata );
-    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-        benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-            {"rows", size},
-            {"columns", size},
-            {"elements per row", elementsPerRow},
-        } ));
-        benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow );
-    }
-}
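
The three loops above sweep the problem size geometrically: starting from `min-size`, each step multiplies by a factor (2 for arrays and SpMV, `size-step-factor` for vectors) until `max-size` is exceeded; a factor of 1 or less would never terminate, which is why `main()` below rejects it. A trivial sketch with the default values from `setupConfig`:

```cpp
#include <cstddef>
#include <cstdio>

// Sketch of the geometric size sweep used by runBlasBenchmarks above,
// with the default min-size, max-size and size-step-factor from setupConfig.
int main()
{
    const std::size_t minSize = 100000, maxSize = 10000000;
    const unsigned stepFactor = 2;
    for( std::size_t size = minSize; size <= maxSize; size *= stepFactor )
        std::printf( "benchmarking size %zu\n", size );   // 100000, 200000, ..., 6400000
    return 0;
}
```
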
-
-void
-setupConfig( Config::ConfigDescription & config )
-{
-    config.addDelimiter( "Benchmark settings:" );
-    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
-    config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
-    config.addEntryEnum( "append" );
-    config.addEntryEnum( "overwrite" );
-    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
-    config.addEntryEnum( "float" );
-    config.addEntryEnum( "double" );
-    config.addEntryEnum( "all" );
-    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 );
-    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 );
-    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
-    config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-    config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
-    config.addEntry< int >( "verbose", "Verbose mode.", 1 );
-
-    config.addDelimiter( "Device settings:" );
-    Devices::Host::configSetup( config );
-    Devices::Cuda::configSetup( config );
-}
-
-int
-main( int argc, char* argv[] )
-{
-    Config::ParameterContainer parameters;
-    Config::ConfigDescription conf_desc;
-
-    setupConfig( conf_desc );
-
-    if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
-        conf_desc.printUsage( argv[ 0 ] );
-        return 1;
-    }
-
-    Devices::Host::setup( parameters );
-    Devices::Cuda::setup( parameters );
-
-    const String & logFileName = parameters.getParameter< String >( "log-file" );
-    const String & outputMode = parameters.getParameter< String >( "output-mode" );
-    const String & precision = parameters.getParameter< String >( "precision" );
-    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
-    // which have a default value. The workaround below works for int values, but it is not possible
-    // to pass 64-bit integer values
-//    const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
-//    const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
-    const std::size_t minSize = parameters.getParameter< int >( "min-size" );
-    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
-    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-    const unsigned loops = parameters.getParameter< unsigned >( "loops" );
-    const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" );
-    const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
-
-    if( sizeStepFactor <= 1 ) {
-        std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    // open log file
-    auto mode = std::ios::out;
-    if( outputMode == "append" )
-        mode |= std::ios::app;
-    std::ofstream logFile( logFileName.getString(), mode );
-
-    // init benchmark and common metadata
-    Benchmark benchmark( loops, verbose );
-
-    // prepare global metadata
-    const int cpu_id = 0;
-    Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id );
-    String cacheInfo = String( cacheSizes.L1data ) + ", "
-                        + String( cacheSizes.L1instruction ) + ", "
-                        + String( cacheSizes.L2 ) + ", "
-                        + String( cacheSizes.L3 );
-#ifdef HAVE_CUDA
-    const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
-    const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
-                              String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
-#endif
-    Benchmark::MetadataMap metadata {
-        { "host name", Devices::SystemInfo::getHostname() },
-        { "architecture", Devices::SystemInfo::getArchitecture() },
-        { "system", Devices::SystemInfo::getSystemName() },
-        { "system release", Devices::SystemInfo::getSystemRelease() },
-        { "start time", Devices::SystemInfo::getCurrentTime() },
-        { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) },
-        { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) },
-        { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) },
-        { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 },
-        { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
-#ifdef HAVE_CUDA
-        { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
-        { "GPU architecture", deviceArch },
-        { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
-        { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 },
-        { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
-        { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
-        { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
-#endif
-    };
-
-    if( precision == "all" || precision == "float" )
-        runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
-    if( precision == "all" || precision == "double" )
-        runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
-
-    if( ! benchmark.save( logFile ) ) {
-        std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
deleted file mode 100644
index cdf2443964a5bb33354de6e0ec0688018c7128a1..0000000000000000000000000000000000000000
--- a/tests/benchmarks/vector-operations.h
+++ /dev/null
@@ -1,442 +0,0 @@
-/***************************************************************************
-                          vector-operations.h  -  description
-                             -------------------
-    begin                : Dec 30, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <stdlib.h> // srand48
-
-#include "benchmarks.h"
-
-#include <TNL/Containers/Vector.h>
-
-#ifdef HAVE_CUDA
-#include "cublasWrappers.h"
-#endif
-
-namespace TNL
-{
-namespace benchmarks
-{
-
-template< typename Real = double,
-          typename Index = int >
-bool
-benchmarkVectorOperations( Benchmark & benchmark,
-                           const int & loops,
-                           const long & size )
-{
-    typedef Containers::Vector< Real, Devices::Host, Index > HostVector;
-    typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector;
-    using namespace std;
-
-    double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-
-    HostVector hostVector, hostVector2;
-    CudaVector deviceVector, deviceVector2;
-    hostVector.setSize( size );
-    hostVector2.setSize( size );
-#ifdef HAVE_CUDA
-    deviceVector.setSize( size );
-    deviceVector2.setSize( size );
-#endif
-
-    Real resultHost( 0 ), resultDevice( 0 );  // initialized, since reset1() reads resultHost via srand48()
-
-#ifdef HAVE_CUDA
-    cublasHandle_t cublasHandle;
-    cublasCreate( &cublasHandle );
-#endif
-
-
-    // reset functions
-    // (Make sure to always use one in benchmarks, even if it is not necessary
-    // to ensure a correct result -- it helps to clear the cache and to avoid
-    // optimizing away the benchmark loop.)
-    auto reset1 = [&]() {
-        hostVector.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceVector.setValue( 1.0 );
-#endif
-        // A relatively harmless call to keep the compiler from realizing we
-        // don't actually do any useful work with the result of the reduction.
-        srand48(resultHost);
-        resultHost = resultDevice = 0.0;
-    };
-    auto reset2 = [&]() {
-        hostVector2.setValue( 1.0 );
-#ifdef HAVE_CUDA
-        deviceVector2.setValue( 1.0 );
-#endif
-    };
-    auto reset12 = [&]() {
-        reset1();
-        reset2();
-    };
-
-
-    reset12();
-
-
-    auto maxHost = [&]() {
-        resultHost = hostVector.max();
-    };
-    auto maxHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionMax< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto maxCuda = [&]() {
-        resultDevice = deviceVector.max();
-    };
-    benchmark.setOperation( "max", datasetSize );
-    benchmark.time( reset1, "CPU", maxHost );
-    benchmark.time( reset1, "CPU (general)", maxHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", maxCuda );
-#endif
-
-
-    auto minHost = [&]() {
-        resultHost = hostVector.min();
-    };
-    auto minHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionMin< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto minCuda = [&]() {
-        resultDevice = deviceVector.min();
-    };
-    benchmark.setOperation( "min", datasetSize );
-    benchmark.time( reset1, "CPU", minHost );
-    benchmark.time( reset1, "CPU (general)", minHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", minCuda );
-#endif
-
-
-    auto absMaxHost = [&]() {
-        resultHost = hostVector.absMax();
-    };
-    auto absMaxHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto absMaxCuda = [&]() {
-        resultDevice = deviceVector.absMax();
-    };
-#ifdef HAVE_CUDA
-    auto absMaxCublas = [&]() {
-        int index = 0;
-        cublasIgamax( cublasHandle, size,
-                      deviceVector.getData(), 1,
-                      &index );
-        resultDevice = deviceVector.getElement( index );
-    };
-#endif
-    benchmark.setOperation( "absMax", datasetSize );
-    benchmark.time( reset1, "CPU", absMaxHost );
-    benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", absMaxCuda );
-    benchmark.time( reset1, "cuBLAS", absMaxCublas );
-#endif
-
-
-    auto absMinHost = [&]() {
-        resultHost = hostVector.absMin();
-    };
-    auto absMinHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto absMinCuda = [&]() {
-        resultDevice = deviceVector.absMin();
-    };
-#ifdef HAVE_CUDA
-    auto absMinCublas = [&]() {
-        int index = 0;
-        cublasIgamin( cublasHandle, size,
-                      deviceVector.getData(), 1,
-                      &index );
-        resultDevice = deviceVector.getElement( index );
-    };
-#endif
-    benchmark.setOperation( "absMin", datasetSize );
-    benchmark.time( reset1, "CPU", absMinHost );
-    benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", absMinCuda );
-    benchmark.time( reset1, "cuBLAS", absMinCublas );
-#endif
-
-
-    auto sumHost = [&]() {
-        resultHost = hostVector.sum();
-    };
-    auto sumHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionSum< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto sumCuda = [&]() {
-        resultDevice = deviceVector.sum();
-    };
-    benchmark.setOperation( "sum", datasetSize );
-    benchmark.time( reset1, "CPU", sumHost );
-    benchmark.time( reset1, "CPU (general)", sumHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", sumCuda );
-#endif
-
-
-    auto l1normHost = [&]() {
-        resultHost = hostVector.lpNorm( 1.0 );
-    };
-    auto l1normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l1normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 1.0 );
-    };
-#ifdef HAVE_CUDA
-    auto l1normCublas = [&]() {
-        cublasGasum( cublasHandle, size,
-                     deviceVector.getData(), 1,
-                     &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "l1 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l1normHost );
-    benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l1normCuda );
-    benchmark.time( reset1, "cuBLAS", l1normCublas );
-#endif
-
-
-    auto l2normHost = [&]() {
-        resultHost = hostVector.lpNorm( 2.0 );
-    };
-    auto l2normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l2normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 2.0 );
-    };
-#ifdef HAVE_CUDA
-    auto l2normCublas = [&]() {
-        cublasGnrm2( cublasHandle, size,
-                     deviceVector.getData(), 1,
-                     &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "l2 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l2normHost );
-    benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l2normCuda );
-    benchmark.time( reset1, "cuBLAS", l2normCublas );
-#endif
-
-
-    auto l3normHost = [&]() {
-        resultHost = hostVector.lpNorm( 3.0 );
-    };
-    auto l3normHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
-        operation.setPower( 3.0 );
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-        return result;
-    };
-    auto l3normCuda = [&]() {
-        resultDevice = deviceVector.lpNorm( 3.0 );
-    };
-    benchmark.setOperation( "l3 norm", datasetSize );
-    benchmark.time( reset1, "CPU", l3normHost );
-    benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", l3normCuda );
-#endif
-
-
-    auto scalarProductHost = [&]() {
-        resultHost = hostVector.scalarProduct( hostVector2 );
-    };
-    auto scalarProductHostGeneral = [&]() {
-        Real result( 0 );
-        Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
-        Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              hostVector2.getData(),
-              result );
-        return result;
-    };
-    auto scalarProductCuda = [&]() {
-        resultDevice = deviceVector.scalarProduct( deviceVector2 );
-    };
-#ifdef HAVE_CUDA
-    auto scalarProductCublas = [&]() {
-        cublasGdot( cublasHandle, size,
-                    deviceVector.getData(), 1,
-                    deviceVector2.getData(), 1,
-                    &resultDevice );
-    };
-#endif
-    benchmark.setOperation( "scalar product", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", scalarProductHost );
-    benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", scalarProductCuda );
-    benchmark.time( reset1, "cuBLAS", scalarProductCublas );
-#endif
-
-    /*
-   std::cout << "Benchmarking prefix-sum:" << std::endl;
-    timer.reset();
-    timer.start();
-    hostVector.computePrefixSum();
-    timer.stop();
-    timeHost = timer.getTime();
-    bandwidth = 2 * datasetSize / loops / timer.getTime();
-   std::cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-
-    timer.reset();
-    timer.start();
-    deviceVector.computePrefixSum();
-    timer.stop();
-    timeDevice = timer.getTime();
-    bandwidth = 2 * datasetSize / loops / timer.getTime();
-   std::cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
-   std::cout << "  CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
-
-    HostVector auxHostVector;
-    auxHostVector.setLike( deviceVector );
-    auxHostVector = deviceVector;
-    for( int i = 0; i < size; i++ )
-       if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-       {
-          std::cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
-       }
-    */
-
-
-    auto multiplyHost = [&]() {
-        hostVector *= 0.5;
-    };
-    auto multiplyCuda = [&]() {
-        deviceVector *= 0.5;
-    };
-#ifdef HAVE_CUDA
-    auto multiplyCublas = [&]() {
-        const Real alpha = 0.5;
-        cublasGscal( cublasHandle, size,
-                     &alpha,
-                     deviceVector.getData(), 1 );
-    };
-#endif
-    benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-    benchmark.time( reset1, "CPU", multiplyHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", multiplyCuda );
-    benchmark.time( reset1, "cuBLAS", multiplyCublas );
-#endif
-
-
-    auto addVectorHost = [&]() {
-        hostVector.addVector( hostVector2 );
-    };
-    auto addVectorCuda = [&]() {
-        deviceVector.addVector( deviceVector2 );
-    };
-#ifdef HAVE_CUDA
-    auto addVectorCublas = [&]() {
-        const Real alpha = 1.0;
-        cublasGaxpy( cublasHandle, size,
-                     &alpha,
-                     deviceVector2.getData(), 1,
-                     deviceVector.getData(), 1 );
-    };
-#endif
-    benchmark.setOperation( "vector addition", 3 * datasetSize );
-    benchmark.time( reset1, "CPU", addVectorHost );
-#ifdef HAVE_CUDA
-    benchmark.time( reset1, "GPU", addVectorCuda );
-    benchmark.time( reset1, "cuBLAS", addVectorCublas );
-#endif
-
-
-#ifdef HAVE_CUDA
-    cublasDestroy( cublasHandle );
-#endif
-
-    return true;
-}
-
-} // namespace benchmarks
-} // namespace TNL
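
The comment at the top of `benchmarkVectorOperations` insists on always passing a reset function, even when it is not needed for correctness: it clears the cache between repetitions and keeps the compiler from optimizing the benchmark loop away. A minimal sketch of a timing loop with that structure, assuming the reset is excluded from the measured time (TNL's actual `timeFunction` may handle this differently):

```cpp
#include <chrono>

// Illustrative timing loop with a reset step before every repetition, in the
// spirit of the reset1/reset2 lambdas above. The reset runs outside the timed
// region; whether TNL's timeFunction does the same is an assumption here.
template< typename Compute, typename Reset >
double timeWithReset( Compute compute, Reset reset, int loops )
{
    double total = 0.0;
    for( int i = 0; i < loops; i++ ) {
        reset();   // restore inputs and flush cached data before each run
        const auto start = std::chrono::steady_clock::now();
        compute();
        const auto stop = std::chrono::steady_clock::now();
        total += std::chrono::duration< double >( stop - start ).count();
    }
    return total;
}
```
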
diff --git a/tests/mpi/GPUmeshFunctionEvaluateTest.cu b/tests/mpi/GPUmeshFunctionEvaluateTest.cu
index 1008b4a4c04dd9edebebd62e4044e34d80587b11..1dcdb95207bbe4896e8a55311658edd1b7d9d440 100644
--- a/tests/mpi/GPUmeshFunctionEvaluateTest.cu
+++ b/tests/mpi/GPUmeshFunctionEvaluateTest.cu
@@ -9,6 +9,7 @@
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -56,7 +57,7 @@ int main ( int argc, char *argv[])
   typedef LinearFunction<double,DIMENSION> LinearFunctionType;
   typedef ConstFunction<double,DIMENSION> ConstFunctionType;
   
-  CommunicatorType::Init(argc,argv);
+  Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
 
   int size=10;
   int cycles=1;
@@ -165,12 +166,8 @@ int main ( int argc, char *argv[])
     cout <<"sync: "<<sync.getRealTime()<<endl;
     cout<<"all: "<<all.getRealTime()<<endl<<endl;
   }
-  
-
-  CommunicatorType::Finalize();
 
   return 0;
-
 }
 
 #else
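
The remaining hunks in this diff all apply the same pattern: explicit `CommunicatorType::Init`/`Finalize` pairs are replaced by an RAII guard, so finalization runs on every exit path (early returns, exceptions). The contents of `TNL/Communicators/ScopedInitializer.h` are not shown in this diff; a minimal sketch of what such a guard presumably looks like, assuming the `Init(argc, argv)`/`Finalize()` interface visible in the removed lines:

```cpp
// Hypothetical sketch of the RAII guard used above; the real
// TNL/Communicators/ScopedInitializer.h may differ.
template< typename Communicator >
struct ScopedInitializer
{
    ScopedInitializer( int& argc, char**& argv )
    {
        Communicator::Init( argc, argv );
    }

    ~ScopedInitializer()
    {
        Communicator::Finalize();
    }
};
```
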
diff --git a/tests/mpi/MeshFunctionEvaluateTest.cpp b/tests/mpi/MeshFunctionEvaluateTest.cpp
index 6a59ae85ce373c05839f1861be31c61d4377f713..3c06a34cb7e9713b253e8dd1f11dce1f794381a7 100644
--- a/tests/mpi/MeshFunctionEvaluateTest.cpp
+++ b/tests/mpi/MeshFunctionEvaluateTest.cpp
@@ -16,6 +16,7 @@ using namespace std;
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h>
@@ -62,7 +63,7 @@ int main ( int argc, char *argv[])
    typedef LinearFunction<double,DIMENSION> LinearFunctionType;
    typedef ConstFunction<double,DIMENSION> ConstFunctionType;
   
-   CommunicatorType::Init(argc,argv);
+   Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv);
 
    int size=9;
    int cycles=1;
@@ -173,7 +174,6 @@ int main ( int argc, char *argv[])
       cout <<"sync: "<<sync.getRealTime()<<endl;
       cout<<"all: "<<all.getRealTime()<<endl<<endl;
    }
-   CommunicatorType::Finalize();
 #else
   std::cout<<"MPI not Supported." << std::endl;
 #endif
diff --git a/tests/mpi/mpiio-save-load-test.cpp b/tests/mpi/mpiio-save-load-test.cpp
index 0e7a8dff2ae08c4f23b59f9f2ebc04f043446ad4..0fa7ee7f65d8756779d2ffb0548eadfc3d9c4b90 100644
--- a/tests/mpi/mpiio-save-load-test.cpp
+++ b/tests/mpi/mpiio-save-load-test.cpp
@@ -2,6 +2,7 @@
 #define MPIIO
 
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
@@ -38,7 +39,7 @@ int main(int argc, char **argv)
         typedef typename DistributedGridType::CoordinatesType CoordinatesType;
         typedef LinearFunction<double,DIM> LinearFunctionType;
 
-        CommunicatorType::Init(argc, argv);
+        Communicators::ScopedInitializer< CommunicatorType > mpi_init(argc, argv);
 
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
         MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
@@ -92,9 +93,6 @@ int main(int argc, char **argv)
             else
                 std::cout <<"Ok!"<<std::endl;
         }
-
-        CommunicatorType::Finalize();
-
 }
 
 #else
diff --git a/tests/mpi/mpiio-save-test.h b/tests/mpi/mpiio-save-test.h
index 6e14b537e5ca3e5c7a70a4b967d6a7a522beeb4e..a824bd5b74c917f8d65de00a57b90526e536424e 100644
--- a/tests/mpi/mpiio-save-test.h
+++ b/tests/mpi/mpiio-save-test.h
@@ -2,6 +2,7 @@
 
 #define MPIIO
 #include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/ScopedInitializer.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
@@ -38,7 +39,7 @@ int main(int argc, char **argv)
         typedef typename DistributedGridType::CoordinatesType CoordinatesType;
         typedef LinearFunction<double,DIM> LinearFunctionType;
 
-        CommunicatorType::Init(argc, argv);
+        Communicators::ScopedInitializer< CommunicatorType > mpi_init(argc, argv);
 
         Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
         MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
@@ -81,9 +82,6 @@ int main(int argc, char **argv)
         
         String fileName=String("./meshFunction.tnl");
         DistributedGridIO<MeshFunctionType,MpiIO> ::save(fileName, *meshFunctionptr );
-
-        CommunicatorType::Finalize();
-
 }
 
 #else