diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ed064eb1de0ebce012daf328173b420ada0f0a4..fc377eee6cdbe9dfbbf7710ebfc4038a8d08417a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -409,10 +409,6 @@ endif()
 #   endif()
 #endif()
 
-if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
-   AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
-endif()
-
 CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
 INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )
 
diff --git a/build b/build
index e0c8dbb993e592c420aa62abd991c189fbff4870..3e2983dcee398c1af61f36ab34166bbf71f11620 100755
--- a/build
+++ b/build
@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
 INSTANTIATE_LONG_DOUBLE="no"
 INSTANTIATE_DOUBLE="yes"
 INSTANTIATE_FLOAT="no"
-OPTIMIZED_VECTOR_HOST_OPERATIONS="no"
 
 for option in "$@"
 do
@@ -75,7 +74,6 @@ do
                                            INSTANTIATE_DOUBLE="yes"
                                            INSTANTIATE_FLOAT="no"
                                            WITH_CUDA_ARCH="auto" ;;
-        --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
         *                                ) 
            echo "Unknown option ${option}. Use --help for more information."
            exit 1 ;;
@@ -175,7 +173,6 @@ cmake_command=(
          -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
          -DINSTANTIATE_INT=${INSTANTIATE_INT}
          -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
-         -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
 )
 
 # Skip running cmake if it was already run and the cmake command is the same.
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index e65f8980b1066e042206e328d15b50e32c81432f..e5b6f5aee648ac89f1ec44656a3ce67b8043c24d 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto maxHost = [&]() {
       resultHost = hostVector.max();
    };
-   auto maxHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionMax< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto maxCuda = [&]() {
       resultDevice = deviceVector.max();
    };
    benchmark.setOperation( "max", datasetSize );
    benchmark.time( reset1, "CPU", maxHost );
-   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", maxCuda );
 #endif
@@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto minHost = [&]() {
       resultHost = hostVector.min();
    };
-   auto minHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionMin< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto minCuda = [&]() {
       resultDevice = deviceVector.min();
    };
    benchmark.setOperation( "min", datasetSize );
    benchmark.time( reset1, "CPU", minHost );
-   benchmark.time( reset1, "CPU (general)", minHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", minCuda );
 #endif
@@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMaxHost = [&]() {
       resultHost = hostVector.absMax();
    };
-   auto absMaxHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto absMaxCuda = [&]() {
       resultDevice = deviceVector.absMax();
    };
@@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "absMax", datasetSize );
    benchmark.time( reset1, "CPU", absMaxHost );
-   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", absMaxCuda );
    benchmark.time( reset1, "cuBLAS", absMaxCublas );
@@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMinHost = [&]() {
       resultHost = hostVector.absMin();
    };
-   auto absMinHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto absMinCuda = [&]() {
       resultDevice = deviceVector.absMin();
    };
@@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "absMin", datasetSize );
    benchmark.time( reset1, "CPU", absMinHost );
-   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", absMinCuda );
    benchmark.time( reset1, "cuBLAS", absMinCublas );
@@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto sumHost = [&]() {
       resultHost = hostVector.sum();
    };
-   auto sumHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionSum< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto sumCuda = [&]() {
       resultDevice = deviceVector.sum();
    };
    benchmark.setOperation( "sum", datasetSize );
    benchmark.time( reset1, "CPU", sumHost );
-   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", sumCuda );
 #endif
@@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l1normHost = [&]() {
       resultHost = hostVector.lpNorm( 1.0 );
    };
-   auto l1normHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto l1normCuda = [&]() {
       resultDevice = deviceVector.lpNorm( 1.0 );
    };
@@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
    benchmark.time( reset1, "CPU", l1normHost );
-   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", l1normCuda );
    benchmark.time( reset1, "cuBLAS", l1normCublas );
@@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l2normHost = [&]() {
       resultHost = hostVector.lpNorm( 2.0 );
    };
-   auto l2normHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto l2normCuda = [&]() {
       resultDevice = deviceVector.lpNorm( 2.0 );
    };
@@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
    benchmark.time( reset1, "CPU", l2normHost );
-   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", l2normCuda );
    benchmark.time( reset1, "cuBLAS", l2normCublas );
@@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l3normHost = [&]() {
       resultHost = hostVector.lpNorm( 3.0 );
    };
-   auto l3normHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
-      operation.setPower( 3.0 );
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              ( Real* ) 0,
-              result );
-      return result;
-   };
    auto l3normCuda = [&]() {
       resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmark.setOperation( "l3 norm", datasetSize );
    benchmark.time( reset1, "CPU", l3normHost );
-   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", l3normCuda );
 #endif
@@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto scalarProductHost = [&]() {
       resultHost = hostVector.scalarProduct( hostVector2 );
    };
-   auto scalarProductHostGeneral = [&]() {
-      Real result( 0 );
-      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
-      Containers::Algorithms::Reduction< Devices::Host >::reduce(
-              operation,
-              hostVector.getSize(),
-              hostVector.getData(),
-              hostVector2.getData(),
-              result );
-      return result;
-   };
    auto scalarProductCuda = [&]() {
       resultDevice = deviceVector.scalarProduct( deviceVector2 );
    };
@@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
    benchmark.time( reset1, "CPU", scalarProductHost );
-   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
 #ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", scalarProductCuda );
    benchmark.time( reset1, "cuBLAS", scalarProductCublas );
diff --git a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
index 6a29ec9380f0d5b04d31f42b431ffb93d66bdd68..e63e431c67a39eed8a06993f71605db48d8e39c5 100644
--- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
+++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
                     cell.getBasis(),
                     gridXIdx,
                     gridYIdx );
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
          
          //std::cerr << "Computing the heat equation ..." << std::endl;
@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
                     cell.getBasis(),
                     gridXIdx,
                     gridYIdx );
-         cudaThreadSynchronize();         
+         cudaDeviceSynchronize();         
          TNL_CHECK_CUDA_DEVICE;
       }
       if( this->cudaKernelType == "templated" )
diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
index 4d8af10804463cfec56226ec56198fb456d69177..4b546be64fdd4f2b225bf5a358d13c5e851a7d89 100644
--- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
+++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h
@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
    while( iteration < 10000 )
    {
       testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       iteration++;
    }
    auto t_stop = std::chrono::high_resolution_clock::now();   
diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
index e8798609898c03831a165b9173dccadffdd4a1ba..33dff1ded6f5968f812236d2dd65b23a2f1c2fc9 100644
--- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
+++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h
@@ -379,7 +379,7 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
          return false;
       }            
       
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
       if( ( cudaErr = cudaGetLastError() ) != cudaSuccess )
       {
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index d5ad82ecbc2c5b6709b15fd38ae7d1465e919ee6..ad3c8d61c0ef09f084bf4b5ba392224ab2786ea3 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -250,7 +250,7 @@ double benchmarkMatrix( const Matrix& matrix,
       matrix.vectorProduct( x, b );
 #ifdef HAVE_CUDA
       if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value )
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
 #endif
       time = timer.getRealTime();
       iterations++;
diff --git a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
index cfb271d7d7c4090cb6fcfcb67766d0115757d1d9..8f6d376fe27ebed3cd67307bf8f24ea2c5d630d4 100644
--- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
+++ b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h
@@ -60,7 +60,7 @@ class CusparseCSRBase
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         TNL_ASSERT( matrix, );
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
 #ifdef HAVE_CUDA
          cusparseDcsrmv( *( this->cusparseHandle ),
                          CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -103,7 +103,7 @@ class CusparseCSR< double > : public CusparseCSRBase< double >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         TNL_ASSERT( matrix, "" );
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
 #ifdef HAVE_CUDA  
 	 double d = 1.0;       
          double* alpha = &d;
@@ -134,7 +134,7 @@ class CusparseCSR< float > : public CusparseCSRBase< float >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         TNL_ASSERT( matrix, "" );
+         TNL_ASSERT_TRUE( matrix, "matrix was not initialized" );
 #ifdef HAVE_CUDA         
          float d = 1.0;       
          float* alpha = &d;
diff --git a/src/Python/pytnl/tnl/SparseMatrix.h b/src/Python/pytnl/tnl/SparseMatrix.h
index 7f8aa040478eec2b3628d7fab6262c415d083ec0..6788d1a68ffd738aef1ca395af9d88b3082b98bc 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.h
+++ b/src/Python/pytnl/tnl/SparseMatrix.h
@@ -51,6 +51,8 @@ void export_Matrix( py::module & m, const char* name )
 
     using VectorType = TNL::Containers::Vector< typename Matrix::RealType, typename Matrix::DeviceType, typename Matrix::IndexType >;
 
+    void (Matrix::* _getCompressedRowLengths)(typename Matrix::CompressedRowLengthsVector&) const = &Matrix::getCompressedRowLengths;
+
     auto matrix = py::class_< Matrix, TNL::Object >( m, name )
         .def(py::init<>())
         // overloads (defined in Object)
@@ -69,7 +71,7 @@ void export_Matrix( py::module & m, const char* name )
         .def("setDimensions",           &Matrix::setDimensions)
         .def("setCompressedRowLengths", &Matrix::setCompressedRowLengths)
         .def("getRowLength",            &Matrix::getRowLength)
-        .def("getCompressedRowLengths", &Matrix::getCompressedRowLengths)
+        .def("getCompressedRowLengths", _getCompressedRowLengths)
         // TODO: export for more types
         .def("setLike",                 &Matrix::template setLike< typename Matrix::RealType, typename Matrix::DeviceType, typename Matrix::IndexType >)
         .def("getNumberOfMatrixElements", &Matrix::getNumberOfMatrixElements)
diff --git a/src/TNL/CMakeLists.txt b/src/TNL/CMakeLists.txt
index cd07ae65910ab69bf004f69bb84231caf78aaab3..306bd82a3c5c6633c893beafa90aa768f0e83bee 100644
--- a/src/TNL/CMakeLists.txt
+++ b/src/TNL/CMakeLists.txt
@@ -14,8 +14,6 @@ ADD_SUBDIRECTORY( Pointers )
 ADD_SUBDIRECTORY( Problems )
 ADD_SUBDIRECTORY( Solvers )
 
-ADD_SUBDIRECTORY( legacy )
-
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL )
 
 set( headers
@@ -57,8 +55,6 @@ set( tnl_SOURCES ${tnl_config_SOURCES}
                  ${tnl_pointers_SOURCES}
                  ${tnl_solvers_SOURCES}
 
-                 ${tnl_legacy_SOURCES}
-
                  ${common_SOURCES} )
 
 set( tnl_CUDA__SOURCES ${tnl_config_CUDA__SOURCES}
@@ -73,7 +69,6 @@ set( tnl_CUDA__SOURCES ${tnl_config_CUDA__SOURCES}
                        ${tnl_problems_CUDA__SOURCES}
                        ${tnl_solvers_CUDA__SOURCES}
 
-                       ${tnl_legacy_CUDA__SOURCES}
                        ${common_SOURCES} )
 
 
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index c233004a602f31ce8b7220b9983c9541f47f6331..1ad8a6e088445fe76a0aeab3bc0bf68cd00c6943 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -314,12 +314,12 @@ class MpiCommunicator
         }
 
         template< typename T >
-        static void Bcast(  T& data, int count, int root,CommunicationGroup group)
+        static void Bcast( T* data, int count, int root, CommunicationGroup group)
         {
 #ifdef HAVE_MPI
            TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized");
            TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup");
-           MPI_Bcast((void*) &data, count,  MPIDataType(data), root, group);
+           MPI_Bcast((void*) data, count, MPIDataType(data), root, group);
 #else
            throw Exceptions::MPISupportMissing();
 #endif
@@ -340,6 +340,21 @@ class MpiCommunicator
 #endif
         }
 
+        // in-place variant of Allreduce
+        template< typename T >
+        static void Allreduce( T* data,
+                               int count,
+                               const MPI_Op &op,
+                               CommunicationGroup group)
+        {
+#ifdef HAVE_MPI
+            TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup");
+            MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPIDataType(data),op,group);
+#else
+            throw Exceptions::MPISupportMissing();
+#endif
+        }
+
 
          template< typename T >
          static void Reduce( const T* data,
diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h
index aac58b916bf17656e9d6c33bead7a4d37441fca7..33bbe01a0d289a74d74af23195ee4d7a60c87366 100644
--- a/src/TNL/Communicators/NoDistrCommunicator.h
+++ b/src/TNL/Communicators/NoDistrCommunicator.h
@@ -93,8 +93,8 @@ class NoDistrCommunicator
       {
       }
 
-      template< typename T > 
-      static void Bcast(  T& data, int count, int root, CommunicationGroup group)
+      template< typename T >
+      static void Bcast( T* data, int count, int root, CommunicationGroup group)
       {
       }
 
@@ -108,6 +108,15 @@ class NoDistrCommunicator
          memcpy( ( void* ) reduced_data, ( const void* ) data, count * sizeof( T ) );
       }
 
+      // in-place variant of Allreduce
+      template< typename T >
+      static void Allreduce( T* data,
+                             int count,
+                             const MPI_Op &op,
+                             CommunicationGroup group )
+      {
+      }
+
       template< typename T >
       static void Reduce( T* data,
                           T* reduced_data,
diff --git a/src/TNL/Containers/Algorithms/ArrayOperations.h b/src/TNL/Containers/Algorithms/ArrayOperations.h
index ad852e10f78e5b4e08d7cf66abf7071f33e5e73e..47050d32fd8f037251e3fe5258c98fe4d5f90b2c 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperations.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperations.h
@@ -42,14 +42,14 @@ class ArrayOperations< Devices::Host >
       static Element getMemoryElement( const Element* data );
 
       template< typename Element, typename Index >
-      static bool setMemory( Element* data,
+      static void setMemory( Element* data,
                              const Element& value,
                              const Index size );
 
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -93,14 +93,14 @@ class ArrayOperations< Devices::Cuda >
       static Element getMemoryElement( const Element* data );
 
       template< typename Element, typename Index >
-      static bool setMemory( Element* data,
+      static void setMemory( Element* data,
                              const Element& value,
                              const Index size );
 
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -132,7 +132,7 @@ class ArrayOperations< Devices::Cuda, Devices::Host >
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -152,7 +152,7 @@ class ArrayOperations< Devices::Host, Devices::Cuda >
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -185,14 +185,14 @@ class ArrayOperations< Devices::MIC >
       static Element getMemoryElement( const Element* data );
 
       template< typename Element, typename Index >
-      static bool setMemory( Element* data,
+      static void setMemory( Element* data,
                              const Element& value,
                              const Index size );
 
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -224,7 +224,7 @@ class ArrayOperations< Devices::MIC, Devices::Host >
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
@@ -244,7 +244,7 @@ class ArrayOperations< Devices::Host, Devices::MIC >
       template< typename DestinationElement,
                 typename SourceElement,
                 typename Index >
-      static bool copyMemory( DestinationElement* destination,
+      static void copyMemory( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size );
 
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
index bca6bdb0479eb38a329234f92305421873864dc5..9a87b52874f6f34f6f788362623060804052399b 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
@@ -8,9 +8,10 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#pragma once 
+#pragma once
 
 #include <iostream>
+#include <memory>
 
 #include <TNL/tnlConfig.h>
 #include <TNL/Math.h>
@@ -21,7 +22,7 @@
 #include <TNL/Containers/Algorithms/ReductionOperations.h>
 
 namespace TNL {
-namespace Containers {   
+namespace Containers {
 namespace Algorithms {
 
 template< typename Element, typename Index >
@@ -99,7 +100,7 @@ setArrayValueCudaKernel( Element* data,
 #endif
 
 template< typename Element, typename Index >
-bool
+void
 ArrayOperations< Devices::Cuda >::
 setMemory( Element* data,
            const Element& value,
@@ -112,7 +113,7 @@ setMemory( Element* data,
    Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
    gridSize. x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
    setArrayValueCudaKernel<<< gridSize, blockSize >>>( data, size, value );
-   return TNL_CHECK_CUDA_DEVICE;
+   TNL_CHECK_CUDA_DEVICE;
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -140,7 +141,7 @@ copyMemoryCudaToCudaKernel( DestinationElement* destination,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::Cuda >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -155,7 +156,7 @@ copyMemory( DestinationElement* destination,
                   source,
                   size * sizeof( DestinationElement ),
                   cudaMemcpyDeviceToDevice );
-      return TNL_CHECK_CUDA_DEVICE;
+      TNL_CHECK_CUDA_DEVICE;
    }
    else
    {
@@ -164,7 +165,7 @@ copyMemory( DestinationElement* destination,
       Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
       gridSize. x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
       copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size );
-      return TNL_CHECK_CUDA_DEVICE;
+      TNL_CHECK_CUDA_DEVICE;
    }
 #else
    throw Exceptions::CudaSupportMissing();
@@ -182,11 +183,8 @@ compareMemory( const Element1* destination,
 {
    TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
-   //TODO: The parallel reduction on the CUDA device with different element types is needed.
-   bool result = false;
    Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
-   Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );
 }
 
 template< typename Element,
@@ -200,11 +198,9 @@ containsValue( const Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
-   bool result = false;
    Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
    reductionContainsValue.setValue( value );
-   Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
 }
 
 template< typename Element,
@@ -218,11 +214,9 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
-   bool result = false;
    Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
    reductionContainsOnlyValue.setValue( value );
-   Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
 }
 
 
@@ -232,7 +226,7 @@ containsOnlyValue( const Element* data,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::Host, Devices::Cuda >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -248,23 +242,20 @@ copyMemory( DestinationElement* destination,
                       size * sizeof( DestinationElement ),
                       cudaMemcpyDeviceToHost ) != cudaSuccess )
          std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
-      return TNL_CHECK_CUDA_DEVICE;
+      TNL_CHECK_CUDA_DEVICE;
    }
    else
    {
-      SourceElement* buffer = new SourceElement[ Devices::Cuda::getGPUTransferBufferSize() ];
+      std::unique_ptr< SourceElement[] > buffer{ new SourceElement[ Devices::Cuda::getGPUTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
-         if( cudaMemcpy( buffer,
-                         &source[ i ],
+         if( cudaMemcpy( (void*) buffer.get(),
+                         (void*) &source[ i ],
                          min( size - i, Devices::Cuda::getGPUTransferBufferSize() ) * sizeof( SourceElement ),
                          cudaMemcpyDeviceToHost ) != cudaSuccess )
-         {
-            delete[] buffer;
             std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
-            return TNL_CHECK_CUDA_DEVICE;
-         }
+         TNL_CHECK_CUDA_DEVICE;
          Index j( 0 );
          while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size )
          {
@@ -273,9 +264,7 @@ copyMemory( DestinationElement* destination,
          }
          i += j;
       }
-      delete[] buffer;
    }
-   return true;
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -298,28 +287,21 @@ compareMemory( const Element1* destination,
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "Array size must be non-negative." );
 #ifdef HAVE_CUDA
-   Element2* host_buffer = new Element2[ Devices::Cuda::getGPUTransferBufferSize() ];
+   std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] };
    Index compared( 0 );
    while( compared < size )
    {
       Index transfer = min( size - compared, Devices::Cuda::getGPUTransferBufferSize() );
-      if( cudaMemcpy( ( void* ) host_buffer,
-                      ( void* ) & ( source[ compared ] ),
+      if( cudaMemcpy( (void*) host_buffer.get(),
+                      (void*) &source[ compared ],
                       transfer * sizeof( Element2 ),
                       cudaMemcpyDeviceToHost ) != cudaSuccess )
-      {
-         delete[] host_buffer;
          std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
-         return TNL_CHECK_CUDA_DEVICE;
-      }
-      if( ! ArrayOperations< Devices::Host >::compareMemory( &destination[ compared ], host_buffer, transfer ) )
-      {
-         delete[] host_buffer;
+      TNL_CHECK_CUDA_DEVICE;
+      if( ! ArrayOperations< Devices::Host >::compareMemory( &destination[ compared ], host_buffer.get(), transfer ) )
          return false;
-      }
       compared += transfer;
    }
-   delete[] host_buffer;
    return true;
 #else
    throw Exceptions::CudaSupportMissing();
@@ -332,7 +314,7 @@ compareMemory( const Element1* destination,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::Cuda, Devices::Host >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -349,11 +331,11 @@ copyMemory( DestinationElement* destination,
                       size * sizeof( DestinationElement ),
                       cudaMemcpyHostToDevice ) != cudaSuccess )
          std::cerr << "Transfer of data from host to CUDA device failed." << std::endl;
-      return TNL_CHECK_CUDA_DEVICE;
+      TNL_CHECK_CUDA_DEVICE;
    }
    else
    {
-      DestinationElement* buffer = new DestinationElement[ Devices::Cuda::getGPUTransferBufferSize() ];
+      std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Devices::Cuda::getGPUTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
@@ -363,19 +345,14 @@ copyMemory( DestinationElement* destination,
             buffer[ j ] = source[ i + j ];
             j++;
          }
-         if( cudaMemcpy( &destination[ i ],
-                         buffer,
+         if( cudaMemcpy( (void*) &destination[ i ],
+                         (void*) buffer.get(),
                          j * sizeof( DestinationElement ),
                          cudaMemcpyHostToDevice ) != cudaSuccess )
-         {
-            delete[] buffer;
             std::cerr << "Transfer of data from host to CUDA device failed." << std::endl;
-            return TNL_CHECK_CUDA_DEVICE;
-         }
+         TNL_CHECK_CUDA_DEVICE;
          i += j;
       }
-      delete[] buffer;
-      return true;
    }
 #else
    throw Exceptions::CudaSupportMissing();
@@ -397,235 +374,6 @@ compareMemory( const Element1* hostData,
    return ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( deviceData, hostData, size );
 }
 
-#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
-
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< char,        int >( char*& data, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< int,         int >( int*& data, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< long int,    int >( long int*& data, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< float,       int >( float*& data, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< double,      int >( double*& data, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< long double, int >( long double*& data, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< char,        long int >( char*& data, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< int,         long int >( int*& data, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< float,       long int >( float*& data, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< char        >( char* data );
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< int         >( int* data );
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< long int    >( long int* data );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< float       >( float* data );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< double      >( double* data );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::freeMemory< long double >( long double* data );
-#endif
-
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< char        >( char* data, const char& value );
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< int         >( int* data, const int& value );
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< long int    >( long int* data, const long int& value );
-#ifdef INSTANTIATE_FLOAT
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< float       >( float* data, const float& value );
-#endif
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< double      >( double* data, const double& value );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template void ArrayOperations< Devices::Cuda >::setMemoryElement< long double >( long double* data, const long double& value );
-#endif
-
-extern template char        ArrayOperations< Devices::Cuda >::getMemoryElement< char        >( const char* data );
-extern template int         ArrayOperations< Devices::Cuda >::getMemoryElement< int         >( const int* data );
-extern template long int    ArrayOperations< Devices::Cuda >::getMemoryElement< long int    >( const long int* data );
-#ifdef INSTANTIATE_FLOAT
-extern template float       ArrayOperations< Devices::Cuda >::getMemoryElement< float       >( const float* data );
-#endif
-extern template double      ArrayOperations< Devices::Cuda >::getMemoryElement< double      >( const double* data );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template long double ArrayOperations< Devices::Cuda >::getMemoryElement< long double >( const long double* data );
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Cuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-#endif
-#endif
-
-#endif
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
index 756731ca9fe645948aa7c9f8e1634e16ebba9a17..c48d4b40bb3fc8997fe0e8adbf5c3fc1a64fb8a1 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
@@ -62,7 +62,7 @@ getMemoryElement( const Element* data )
 }
 
 template< typename Element, typename Index >
-bool
+void
 ArrayOperations< Devices::Host >::
 setMemory( Element* data,
            const Element& value,
@@ -70,13 +70,12 @@ setMemory( Element* data,
 {
    for( Index i = 0; i < size; i ++ )
       data[ i ] = value;
-   return true;
 }
 
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::Host >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -100,7 +99,6 @@ copyMemory( DestinationElement* destination,
    else
       for( Index i = 0; i < size; i ++ )
          destination[ i ] = ( DestinationElement ) source[ i ];
-   return true;
 }
 
 template< typename DestinationElement,
@@ -164,140 +162,6 @@ containsOnlyValue( const Element* data,
    return true;
 }
 
-
-#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
-
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< char,        int >( char*& data, const int size );
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< int,         int >( int*& data, const int size );
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< long int,    int >( long int*& data, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< float,       int >( float*& data, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< double,      int >( double*& data, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< long double, int >( long double*& data, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< char,        long int >( char*& data, const long int size );
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< int,         long int >( int*& data, const long int size );
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< long int,    long int >( long int*& data, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< float,       long int >( float*& data, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< double,      long int >( double*& data, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::allocateMemory< long double, long int >( long double*& data, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Host >::freeMemory< char        >( char* data );
-extern template bool ArrayOperations< Devices::Host >::freeMemory< int         >( int* data );
-extern template bool ArrayOperations< Devices::Host >::freeMemory< long int    >( long int* data );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::freeMemory< float       >( float* data );
-#endif
-extern template bool ArrayOperations< Devices::Host >::freeMemory< double      >( double* data );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::freeMemory< long double >( long double* data );
-#endif
-
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< char        >( char* data, const char& value );
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< int         >( int* data, const int& value );
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< long int    >( long int* data, const long int& value );
-#ifdef INSTANTIATE_FLOAT
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< float       >( float* data, const float& value );
-#endif
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< double      >( double* data, const double& value );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template void ArrayOperations< Devices::Host >::setMemoryElement< long double >( long double* data, const long double& value );
-#endif
-
-extern template char        ArrayOperations< Devices::Host >::getMemoryElement< char        >( char* data );
-extern template int         ArrayOperations< Devices::Host >::getMemoryElement< int         >( int* data );
-extern template long int    ArrayOperations< Devices::Host >::getMemoryElement< long int    >( long int* data );
-#ifdef INSTANTIATE_FLOAT
-extern template float       ArrayOperations< Devices::Host >::getMemoryElement< float       >( float* data );
-#endif
-extern template double      ArrayOperations< Devices::Host >::getMemoryElement< double      >( double* data );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template long double ArrayOperations< Devices::Host >::getMemoryElement< long double >( long double* data );
-#endif
-
-extern template bool ArrayOperations< Devices::Host >::copyMemory< char,                char, int >( char* destination, const char* source, const int size );
-extern template bool ArrayOperations< Devices::Host >::copyMemory< int,                  int, int >( int* destination, const int* source, const int size );
-extern template bool ArrayOperations< Devices::Host >::copyMemory< long int,        long int, int >( long int* destination, const long int* source, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::copyMemory< float,              float, int >( float* destination, const float* source, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::copyMemory< double,            double, int >( double* destination, const double* source, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::copyMemory< long double,  long double, int >( long double* destination, const long double* source, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host >::copyMemory< char,                char, long int >( char* destination, const char* source, const long int size );
-extern template bool ArrayOperations< Devices::Host >::copyMemory< int,                  int, long int >( int* destination, const int* source, const long int size );
-extern template bool ArrayOperations< Devices::Host >::copyMemory< long int,        long int, long int >( long int* destination, const long int* source, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::copyMemory< float,              float, long int >( float* destination, const float* source, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::copyMemory< double,            double, long int >( double* destination, const double* source, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::copyMemory< long double,  long double, long int >( long double* destination, const long double* source, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Host >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
-extern template bool ArrayOperations< Devices::Host >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
-extern template bool ArrayOperations< Devices::Host >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
-extern template bool ArrayOperations< Devices::Host >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
-extern template bool ArrayOperations< Devices::Host >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-#endif
-#endif
-
-extern template bool ArrayOperations< Devices::Host >::setMemory< char,        int >( char* destination, const char& value, const int size );
-extern template bool ArrayOperations< Devices::Host >::setMemory< int,         int >( int* destination, const int& value, const int size );
-extern template bool ArrayOperations< Devices::Host >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::setMemory< float,       int >( float* destination, const float& value, const int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::setMemory< double,      int >( double* destination, const double& value, const int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool ArrayOperations< Devices::Host >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
-extern template bool ArrayOperations< Devices::Host >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
-extern template bool ArrayOperations< Devices::Host >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
-#ifdef INSTANTIATE_FLOAT
-extern template bool ArrayOperations< Devices::Host >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
-#endif
-extern template bool ArrayOperations< Devices::Host >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool ArrayOperations< Devices::Host >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-#endif
-#endif
-
-#endif
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsMIC_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsMIC_impl.h
index 6a7b873531686ed0da24274ae97321a725d27f18..0289c3c6c85d482340d74a6f100ad26695294a45 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsMIC_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsMIC_impl.h
@@ -78,7 +78,7 @@ getMemoryElement( const Element* data )
 }
 
 template< typename Element, typename Index >
-bool
+void
 ArrayOperations< Devices::MIC >::
 setMemory( Element* data,
            const Element& value,
@@ -95,7 +95,6 @@ setMemory( Element* data,
        for(int i=0;i<size;i++)
            dst[i]=tmp;
    }
-   return true;
 #else
    throw Exceptions::MICSupportMissing();
 #endif
@@ -104,7 +103,7 @@ setMemory( Element* data,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::MIC >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -123,7 +122,6 @@ copyMemory( DestinationElement* destination,
          {
              memcpy(dst_ptr.pointer,src_ptr.pointer,size*sizeof(DestinationElement));
          }
-         return true;
       }
       else
       {
@@ -136,13 +134,10 @@ copyMemory( DestinationElement* destination,
              for(int i=0;i<size;i++)
                  dst_ptr.pointer[i]=src_ptr.pointer[i];
          }
-         return true;
-
       }
    #else
       throw Exceptions::MICSupportMissing();
    #endif
-      return false;
 }
 
 template< typename Element1,
@@ -242,7 +237,7 @@ containsOnlyValue( const Element* data,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::Host, Devices::MIC >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -267,7 +262,6 @@ copyMemory( DestinationElement* destination,
          }
 
          memcpy((void*)destination,(void*)&tmp,size*sizeof(SourceElement));
-         return true;
       }
       else
       {
@@ -277,7 +271,6 @@ copyMemory( DestinationElement* destination,
           {
               memcpy((void*)tmp,src_ptr.pointer,size*sizeof(SourceElement));
           }
-          return true;
       }
    }
    else
@@ -297,7 +290,6 @@ copyMemory( DestinationElement* destination,
          }
 
          memcpy((void*)destination,(void*)&tmp,size*sizeof(DestinationElement));
-         return true;
       }
       else
       {
@@ -309,10 +301,8 @@ copyMemory( DestinationElement* destination,
               for(int i=0;i<size;i++)
                   dst[i]=src_ptr.pointer[i];
           }
-          return true;
       }
    }
-   return false;
 #else
    throw Exceptions::MICSupportMissing();
 #endif
@@ -368,7 +358,7 @@ compareMemory( const Element1* destination,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool
+void
 ArrayOperations< Devices::MIC, Devices::Host >::
 copyMemory( DestinationElement* destination,
             const SourceElement* source,
@@ -393,8 +383,6 @@ copyMemory( DestinationElement* destination,
          {
               memcpy(dst_ptr.pointer,(void*)&tmp,size*sizeof(SourceElement));
          }
-
-         return true;
       }
       else
       {
@@ -404,7 +392,6 @@ copyMemory( DestinationElement* destination,
           {
               memcpy(dst_ptr.pointer,(void*)tmp,size*sizeof(SourceElement));
           }
-          return true;
       }
    }
    else
@@ -423,7 +410,6 @@ copyMemory( DestinationElement* destination,
               for(int i=0;i<size;i++)
                   dst_ptr.pointer[i]=src[i];
          }
-         return true;
       }
       else
       {
@@ -435,10 +421,8 @@ copyMemory( DestinationElement* destination,
               for(int i=0;i<size;i++)
                   dst_ptr.pointer[i]=src[i];
           }
-          return true;
       }
    }
-   return false;
 #else
    throw Exceptions::MICSupportMissing();
 #endif
diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h
index 78e408cc1b079c72ad30b6158c82b190a4a929f4..42b8bf28d16842f9d28a624f8995f2ffe959d943 100644
--- a/src/TNL/Containers/Algorithms/Multireduction.h
+++ b/src/TNL/Containers/Algorithms/Multireduction.h
@@ -18,7 +18,7 @@
 
 namespace TNL {
 namespace Containers {
-namespace Algorithms {   
+namespace Algorithms {
 
 template< typename Device >
 class Multireduction
@@ -30,7 +30,7 @@ class Multireduction< Devices::Cuda >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static void
    reduce( Operation& operation,
            const int n,
            const Index size,
@@ -45,7 +45,7 @@ class Multireduction< Devices::Host >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static void
    reduce( Operation& operation,
            const int n,
            const Index size,
@@ -60,7 +60,7 @@ class Multireduction< Devices::MIC >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static void
    reduce( Operation& operation,
            const int n,
            const Index size,
diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h
index 92c1f9b19c4a10a1aa967f97930df0550c058894..505c2be0e4d55b5e2b914a6d91e4d0739f28819a 100644
--- a/src/TNL/Containers/Algorithms/Multireduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h
@@ -49,7 +49,7 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024
  *    hostResult: output array of size = n
  */
 template< typename Operation, typename Index >
-bool
+void
 Multireduction< Devices::Cuda >::
 reduce( Operation& operation,
         const int n,
@@ -75,18 +75,17 @@ reduce( Operation& operation,
     */
    if( n * ldInput1 < Multireduction_minGpuDataSize ) {
       DataType1 hostArray1[ Multireduction_minGpuDataSize ];
-      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, n * ldInput1 ) )
-         return false;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, n * ldInput1 );
       if( deviceInput2 ) {
          using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
          _DT2 hostArray2[ Multireduction_minGpuDataSize ];
-         if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ) )
-            return false;
-         return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
+         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
+         Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
       }
       else {
-         return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult );
+         Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult );
       }
+      return;
    }
 
    #ifdef CUDA_REDUCTION_PROFILING
@@ -117,8 +116,7 @@ reduce( Operation& operation,
     * Transfer the reduced data from device to host.
     */
    ResultType resultArray[ n * reducedSize ];
-   if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, n * reducedSize ) )
-      return false;
+   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, n * reducedSize );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
@@ -146,7 +144,7 @@ reduce( Operation& operation,
       std::cout << "   Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
    #endif
 
-   return TNL_CHECK_CUDA_DEVICE;
+   TNL_CHECK_CUDA_DEVICE;
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -163,7 +161,7 @@ reduce( Operation& operation,
  *    hostResult: output array of size = n
  */
 template< typename Operation, typename Index >
-bool
+void
 Multireduction< Devices::Host >::
 reduce( Operation& operation,
         const int n,
@@ -249,12 +247,10 @@ reduce( Operation& operation,
 #ifdef HAVE_OPENMP
    }
 #endif
-
-   return true;
 }
 
 template< typename Operation, typename Index >
-bool
+void
 Multireduction< Devices::MIC >::
 reduce( Operation& operation,
         const int n,
@@ -267,11 +263,9 @@ reduce( Operation& operation,
    TNL_ASSERT( n > 0, );
    TNL_ASSERT( size <= ldInput1, );
 
-   std::cout << "Not Implemented yet Multireduction< Devices::MIC >::reduce" << std::endl;
-   return true;
+   throw std::runtime_error("Not Implemented yet Multireduction< Devices::MIC >::reduce");
 }
 
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
index e8e544db2091b86603ae02cb6fd86de2771a3fa7..d4f45f30e435590629475d107254822485b33979 100644
--- a/src/TNL/Containers/Algorithms/Reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -30,12 +30,11 @@ class Reduction< Devices::Cuda >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static typename Operation::ResultType
    reduce( Operation& operation,
            const Index size,
            const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2,
-           typename Operation::ResultType& result );
+           const typename Operation::DataType2* deviceInput2 );
 };
 
 template<>
@@ -43,12 +42,11 @@ class Reduction< Devices::Host >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static typename Operation::ResultType
    reduce( Operation& operation,
            const Index size,
            const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2,
-           typename Operation::ResultType& result );
+           const typename Operation::DataType2* deviceInput2 );
 };
 
 template<>
@@ -56,12 +54,11 @@ class Reduction< Devices::MIC >
 {
 public:
    template< typename Operation, typename Index >
-   static bool
+   static typename Operation::ResultType
    reduce( Operation& operation,
            const Index size,
            const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2,
-           typename Operation::ResultType& result );
+           const typename Operation::DataType2* deviceInput2 );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Containers/Algorithms/ReductionOperations.h b/src/TNL/Containers/Algorithms/ReductionOperations.h
index c6be17ed9ba270bf2beaab883ff21473de8bb9a9..33ef84b1c7f2fb0f9cb5ea4b72fdb9776fcce08c 100644
--- a/src/TNL/Containers/Algorithms/ReductionOperations.h
+++ b/src/TNL/Containers/Algorithms/ReductionOperations.h
@@ -462,7 +462,7 @@ public:
 };
 
 template< typename Data1, typename Data2, typename Result = Data1 >
-class ParallelReductionDiffAbsSum : public ParallelReductionMax< Result, Result >
+class ParallelReductionDiffAbsSum : public ParallelReductionSum< Result, Result >
 {
 public:
    using DataType1 = Data1;
diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h
index 9ebfce43f487759df2e55c7fe3224f2a9e95aa30..ee4e8d9792b9832a259d3ac07a571839837eac14 100644
--- a/src/TNL/Containers/Algorithms/Reduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Reduction_impl.h
@@ -10,7 +10,7 @@
 
 // Implemented by: Tomas Oberhuber, Jakub Klinkovsky
 
-#pragma once 
+#pragma once
 
 #include "Reduction.h"
 
@@ -39,13 +39,12 @@ namespace Algorithms {
 static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;
 
 template< typename Operation, typename Index >
-bool
+typename Operation::ResultType
 Reduction< Devices::Cuda >::
 reduce( Operation& operation,
         const Index size,
         const typename Operation::DataType1* deviceInput1,
-        const typename Operation::DataType2* deviceInput2,
-        typename Operation::ResultType& result )
+        const typename Operation::DataType2* deviceInput2 )
 {
 #ifdef HAVE_CUDA
 
@@ -54,7 +53,7 @@ reduce( Operation& operation,
    typedef typename Operation::DataType2 DataType2;
    typedef typename Operation::ResultType ResultType;
    typedef typename Operation::LaterReductionOperation LaterReductionOperation;
- 
+
    /***
     * Only fundamental and pointer types can be safely reduced on host. Complex
     * objects stored on the device might contain pointers into the device memory,
@@ -70,17 +69,15 @@ reduce( Operation& operation,
    if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
    {
       typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
-      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ) )
-         return false;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
       if( deviceInput2 ) {
          using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
          typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
-         if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ) )
-            return false;
-         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result );
+         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
+         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 );
       }
       else {
-         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result );
+         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
       }
    }
 
@@ -111,26 +108,27 @@ reduce( Operation& operation,
        * Transfer the reduced data from device to host.
        */
       ResultType resultArray[ reducedSize ];
-      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ) )
-         return false;
-    
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
          timer.reset();
          timer.start();
       #endif
-    
+
       /***
        * Reduce the data on the host system.
        */
       LaterReductionOperation laterReductionOperation;
-      Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr, result );
-    
+      const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr );
+
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
       #endif
+
+      return result;
    }
    else {
       /***
@@ -153,30 +151,28 @@ reduce( Operation& operation,
       #endif
 
       ResultType resultArray[ 1 ];
-      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ) )
-         return false;
-      result = resultArray[ 0 ];
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+      const ResultType result = resultArray[ 0 ];
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
       #endif
+
+      return result;
    }
- 
-   return TNL_CHECK_CUDA_DEVICE;
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
 };
 
 template< typename Operation, typename Index >
-bool
+typename Operation::ResultType
 Reduction< Devices::Host >::
 reduce( Operation& operation,
         const Index size,
         const typename Operation::DataType1* input1,
-        const typename Operation::DataType2* input2,
-        typename Operation::ResultType& result )
+        const typename Operation::DataType2* input2 )
 {
    typedef Index IndexType;
    typedef typename Operation::DataType1 DataType1;
@@ -185,1542 +181,58 @@ reduce( Operation& operation,
 
 #ifdef HAVE_OPENMP
    constexpr int block_size = 128;
-   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size )
+   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
+      // global result variable
+      ResultType result = operation.initialValue();
 #pragma omp parallel
-   {
-      const int blocks = size / block_size;
-
-      // first thread initializes the global result variable
-      #pragma omp single nowait
-      {
-         result = operation.initialValue();
-      }
-
-      // initialize thread-local result variable
-      ResultType r = operation.initialValue();
-
-      #pragma omp for nowait
-      for( int b = 0; b < blocks; b++ ) {
-         const int offset = b * block_size;
-         for( IndexType i = 0; i < block_size; i++ )
-            operation.firstReduction( r, offset + i, input1, input2 );
-      }
-
-      // the first thread that reaches here processes the last, incomplete block
-      #pragma omp single nowait
-      {
-         for( IndexType i = blocks * block_size; i < size; i++ )
-            operation.firstReduction( r, i, input1, input2 );
-      }
-
-      // inter-thread reduction of local results
-      #pragma omp critical
       {
-         operation.commonReduction( result, r );
+         const int blocks = size / block_size;
+
+         // initialize array for thread-local results
+         ResultType r[ 4 ] = { operation.initialValue() };
+
+         #pragma omp for nowait
+         for( int b = 0; b < blocks; b++ ) {
+            const int offset = b * block_size;
+            for( IndexType i = 0; i < block_size; i += 4 ) {
+               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
+               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
+               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
+               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
+            }
+         }
+
+         // the first thread that reaches here processes the last, incomplete block
+         #pragma omp single nowait
+         {
+            for( IndexType i = blocks * block_size; i < size; i++ )
+               operation.firstReduction( r[ 0 ], i, input1, input2 );
+         }
+
+         // reduction of local results
+         operation.commonReduction( r[ 0 ], r[ 1 ] );
+         operation.commonReduction( r[ 0 ], r[ 2 ] );
+         operation.commonReduction( r[ 0 ], r[ 3 ] );
+
+         // inter-thread reduction of local results
+         #pragma omp critical
+         {
+            operation.commonReduction( result, r[ 0 ] );
+         }
       }
+      return result;
    }
    else {
 #endif
-      result = operation.initialValue();
+      ResultType result = operation.initialValue();
       for( IndexType i = 0; i < size; i++ )
          operation.firstReduction( result, i, input1, input2 );
+      return result;
 #ifdef HAVE_OPENMP
    }
 #endif
-
-   return true;
 }
 
-
-#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
-
-/****
- * Sum
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
-                                   ( const tnlParallelReductionSum< char, int >& operation,
-                                     const typename tnlParallelReductionSum< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
-                                   ( const tnlParallelReductionSum< int, int >& operation,
-                                     const typename tnlParallelReductionSum< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
-                                   ( const tnlParallelReductionSum< float, int >& operation,
-                                     const typename tnlParallelReductionSum< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
-                                   ( const tnlParallelReductionSum< double, int>& operation,
-                                     const typename tnlParallelReductionSum< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
-                                   ( const tnlParallelReductionSum< long double, int>& operation,
-                                     const typename tnlParallelReductionSum< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
-                                   ( const tnlParallelReductionSum< char, long int >& operation,
-                                     const typename tnlParallelReductionSum< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > >
-                                   ( const tnlParallelReductionSum< int, long int >& operation,
-                                     const typename tnlParallelReductionSum< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > >
-                                   ( const tnlParallelReductionSum< float, long int >& operation,
-                                     const typename tnlParallelReductionSum< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > >
-                                   ( const tnlParallelReductionSum< double, long int>& operation,
-                                     const typename tnlParallelReductionSum< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
-                                   ( const tnlParallelReductionSum< long double, long int>& operation,
-                                     const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionSum< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionSum< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Min
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
-                                   ( const tnlParallelReductionMin< char, int >& operation,
-                                     const typename tnlParallelReductionMin< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > >
-                                   ( const tnlParallelReductionMin< int, int >& operation,
-                                     const typename tnlParallelReductionMin< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > >
-                                   ( const tnlParallelReductionMin< float, int >& operation,
-                                     const typename tnlParallelReductionMin< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
-                                   ( const tnlParallelReductionMin< double, int >& operation,
-                                     const typename tnlParallelReductionMin< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
-                                   ( const tnlParallelReductionMin< long double, int>& operation,
-                                     const typename tnlParallelReductionMin< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
-                                   ( const tnlParallelReductionMin< char, long int >& operation,
-                                     const typename tnlParallelReductionMin< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > >
-                                   ( const tnlParallelReductionMin< int, long int >& operation,
-                                     const typename tnlParallelReductionMin< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > >
-                                   ( const tnlParallelReductionMin< float, long int >& operation,
-                                     const typename tnlParallelReductionMin< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > >
-                                   ( const tnlParallelReductionMin< double, long int>& operation,
-                                     const typename tnlParallelReductionMin< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
-                                   ( const tnlParallelReductionMin< long double, long int>& operation,
-                                     const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMin< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMin< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Max
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > >
-                                   ( const tnlParallelReductionMax< char, int >& operation,
-                                     const typename tnlParallelReductionMax< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > >
-                                   ( const tnlParallelReductionMax< int, int >& operation,
-                                     const typename tnlParallelReductionMax< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > >
-                                   ( const tnlParallelReductionMax< float, int >& operation,
-                                     const typename tnlParallelReductionMax< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
-                                   ( const tnlParallelReductionMax< double, int>& operation,
-                                     const typename tnlParallelReductionMax< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
-                                   ( const tnlParallelReductionMax< long double, int>& operation,
-                                     const typename tnlParallelReductionMax< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
-                                   ( const tnlParallelReductionMax< char, long int >& operation,
-                                     const typename tnlParallelReductionMax< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > >
-                                   ( const tnlParallelReductionMax< int, long int >& operation,
-                                     const typename tnlParallelReductionMax< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > >
-                                   ( const tnlParallelReductionMax< float, long int >& operation,
-                                     const typename tnlParallelReductionMax< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > >
-                                   ( const tnlParallelReductionMax< double, long int>& operation,
-                                     const typename tnlParallelReductionMax< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
-                                   ( const tnlParallelReductionMax< long double, long int>& operation,
-                                     const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionMax< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionMax< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-
-/****
- * Abs sum
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > >
-                                   ( const tnlParallelReductionAbsSum< char, int >& operation,
-                                     const typename tnlParallelReductionAbsSum< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > >
-                                   ( const tnlParallelReductionAbsSum< int, int >& operation,
-                                     const typename tnlParallelReductionAbsSum< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > >
-                                   ( const tnlParallelReductionAbsSum< float, int >& operation,
-                                     const typename tnlParallelReductionAbsSum< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
-                                   ( const tnlParallelReductionAbsSum< double, int>& operation,
-                                     const typename tnlParallelReductionAbsSum< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
-                                   ( const tnlParallelReductionAbsSum< long double, int>& operation,
-                                     const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
-                                   ( const tnlParallelReductionAbsSum< char, long int >& operation,
-                                     const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
-                                   ( const tnlParallelReductionAbsSum< int, long int >& operation,
-                                     const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > >
-                                   ( const tnlParallelReductionAbsSum< float, long int >& operation,
-                                     const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > >
-                                   ( const tnlParallelReductionAbsSum< double, long int>& operation,
-                                     const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
-                                   ( const tnlParallelReductionAbsSum< long double, long int>& operation,
-                                     const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsSum< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsSum< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Abs min
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
-                                   ( const tnlParallelReductionAbsMin< char, int >& operation,
-                                     const typename tnlParallelReductionAbsMin< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
-                                   ( const tnlParallelReductionAbsMin< int, int >& operation,
-                                     const typename tnlParallelReductionAbsMin< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
-                                   ( const tnlParallelReductionAbsMin< float, int >& operation,
-                                     const typename tnlParallelReductionAbsMin< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
-                                   ( const tnlParallelReductionAbsMin< double, int>& operation,
-                                     const typename tnlParallelReductionAbsMin< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
-                                   ( const tnlParallelReductionAbsMin< long double, int>& operation,
-                                     const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
-                                   ( const tnlParallelReductionAbsMin< char, long int >& operation,
-                                     const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
-                                   ( const tnlParallelReductionAbsMin< int, long int >& operation,
-                                     const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >
-                                   ( const tnlParallelReductionAbsMin< float, long int >& operation,
-                                     const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >
-                                   ( const tnlParallelReductionAbsMin< double, long int>& operation,
-                                     const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
-                                   ( const tnlParallelReductionAbsMin< long double, long int>& operation,
-                                     const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMin< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMin< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Abs max
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
-                                   ( const tnlParallelReductionAbsMax< char, int >& operation,
-                                     const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >
-                                   ( const tnlParallelReductionAbsMax< int, int >& operation,
-                                     const typename tnlParallelReductionAbsMax< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >
-                                   ( const tnlParallelReductionAbsMax< float, int >& operation,
-                                     const typename tnlParallelReductionAbsMax< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
-                                   ( const tnlParallelReductionAbsMax< double, int>& operation,
-                                     const typename tnlParallelReductionAbsMax< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
-                                   ( const tnlParallelReductionAbsMax< long double, int>& operation,
-                                     const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
-                                   ( const tnlParallelReductionAbsMax< char, long int >& operation,
-                                     const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
-                                   ( const tnlParallelReductionAbsMax< int, long int >& operation,
-                                     const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >
-                                   ( const tnlParallelReductionAbsMax< float, long int >& operation,
-                                     const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >
-                                   ( const tnlParallelReductionAbsMax< double, long int>& operation,
-                                     const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
-                                   ( const tnlParallelReductionAbsMax< long double, long int>& operation,
-                                     const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionAbsMax< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionAbsMax< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Logical AND
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > >
-                                   ( const tnlParallelReductionLogicalAnd< char, int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > >
-                                   ( const tnlParallelReductionLogicalAnd< int, int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > >
-                                   ( const tnlParallelReductionLogicalAnd< float, int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > >
-                                   ( const tnlParallelReductionLogicalAnd< double, int>& operation,
-                                     const typename tnlParallelReductionLogicalAnd< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
-                                   ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
-                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< int, long int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< float, long int >& operation,
-                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< double, long int>& operation,
-                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
-                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Logical OR
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > >
-                                   ( const tnlParallelReductionLogicalOr< char, int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > >
-                                   ( const tnlParallelReductionLogicalOr< int, int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > >
-                                   ( const tnlParallelReductionLogicalOr< float, int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > >
-                                   ( const tnlParallelReductionLogicalOr< double, int>& operation,
-                                     const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
-                                   ( const tnlParallelReductionLogicalOr< long double, int>& operation,
-                                     const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
-                                   ( const tnlParallelReductionLogicalOr< char, long int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > >
-                                   ( const tnlParallelReductionLogicalOr< int, long int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > >
-                                   ( const tnlParallelReductionLogicalOr< float, long int >& operation,
-                                     const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > >
-                                   ( const tnlParallelReductionLogicalOr< double, long int>& operation,
-                                     const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
-                                   ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
-                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Lp Norm
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > >
-                                   ( const tnlParallelReductionLpNorm< float, int >& operation,
-                                     const typename tnlParallelReductionLpNorm< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
-                                   ( const tnlParallelReductionLpNorm< double, int>& operation,
-                                     const typename tnlParallelReductionLpNorm< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
-                                   ( const tnlParallelReductionLpNorm< long double, int>& operation,
-                                     const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
-                                   ( const tnlParallelReductionLpNorm< char, long int >& operation,
-                                     const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >
-                                   ( const tnlParallelReductionLpNorm< int, long int >& operation,
-                                     const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > >
-                                   ( const tnlParallelReductionLpNorm< float, long int >& operation,
-                                     const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > >
-                                   ( const tnlParallelReductionLpNorm< double, long int>& operation,
-                                     const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
-                                   ( const tnlParallelReductionLpNorm< long double, long int>& operation,
-                                     const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionLpNorm< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionLpNorm< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Equalities
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > >
-                                   ( const tnlParallelReductionEqualities< char, int >& operation,
-                                     const typename tnlParallelReductionEqualities< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > >
-                                   ( const tnlParallelReductionEqualities< int, int >& operation,
-                                     const typename tnlParallelReductionEqualities< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > >
-                                   ( const tnlParallelReductionEqualities< float, int >& operation,
-                                     const typename tnlParallelReductionEqualities< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > >
-                                   ( const tnlParallelReductionEqualities< double, int>& operation,
-                                     const typename tnlParallelReductionEqualities< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
-                                   ( const tnlParallelReductionEqualities< long double, int>& operation,
-                                     const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
-                                   ( const tnlParallelReductionEqualities< char, long int >& operation,
-                                     const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > >
-                                   ( const tnlParallelReductionEqualities< int, long int >& operation,
-                                     const typename tnlParallelReductionEqualities< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > >
-                                   ( const tnlParallelReductionEqualities< float, long int >& operation,
-                                     const typename tnlParallelReductionEqualities< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > >
-                                   ( const tnlParallelReductionEqualities< double, long int>& operation,
-                                     const typename tnlParallelReductionEqualities< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
-                                   ( const tnlParallelReductionEqualities< long double, long int>& operation,
-                                     const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionEqualities< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionEqualities< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Inequalities
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > >
-                                   ( const tnlParallelReductionInequalities< char, int >& operation,
-                                     const typename tnlParallelReductionInequalities< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > >
-                                   ( const tnlParallelReductionInequalities< int, int >& operation,
-                                     const typename tnlParallelReductionInequalities< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > >
-                                   ( const tnlParallelReductionInequalities< float, int >& operation,
-                                     const typename tnlParallelReductionInequalities< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > >
-                                   ( const tnlParallelReductionInequalities< double, int>& operation,
-                                     const typename tnlParallelReductionInequalities< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
-                                   ( const tnlParallelReductionInequalities< long double, int>& operation,
-                                     const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
-                                   ( const tnlParallelReductionInequalities< char, long int >& operation,
-                                     const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > >
-                                   ( const tnlParallelReductionInequalities< int, long int >& operation,
-                                     const typename tnlParallelReductionInequalities< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > >
-                                   ( const tnlParallelReductionInequalities< float, long int >& operation,
-                                     const typename tnlParallelReductionInequalities< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > >
-                                   ( const tnlParallelReductionInequalities< double, long int>& operation,
-                                     const typename tnlParallelReductionInequalities< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
-                                   ( const tnlParallelReductionInequalities< long double, long int>& operation,
-                                     const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionInequalities< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionInequalities< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * ScalarProduct
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, int > >
-                                   ( const tnlParallelReductionScalarProduct< char, int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< int, int > >
-                                   ( const tnlParallelReductionScalarProduct< int, int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< float, int > >
-                                   ( const tnlParallelReductionScalarProduct< float, int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double, int > >
-                                   ( const tnlParallelReductionScalarProduct< double, int>& operation,
-                                     const typename tnlParallelReductionScalarProduct< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
-                                   ( const tnlParallelReductionScalarProduct< long double, int>& operation,
-                                     const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
-                                   ( const tnlParallelReductionScalarProduct< char, long int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< int, long int > >
-                                   ( const tnlParallelReductionScalarProduct< int, long int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< float, long int > >
-                                   ( const tnlParallelReductionScalarProduct< float, long int >& operation,
-                                     const typename tnlParallelReductionScalarProduct< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double, long int > >
-                                   ( const tnlParallelReductionScalarProduct< double, long int>& operation,
-                                     const typename tnlParallelReductionScalarProduct< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
-                                   ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
-                                     const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionScalarProduct< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionScalarProduct< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff sum
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, int > >
-                                   ( const tnlParallelReductionDiffSum< char, int >& operation,
-                                     const typename tnlParallelReductionDiffSum< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< int, int > >
-                                   ( const tnlParallelReductionDiffSum< int, int >& operation,
-                                     const typename tnlParallelReductionDiffSum< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< float, int > >
-                                   ( const tnlParallelReductionDiffSum< float, int >& operation,
-                                     const typename tnlParallelReductionDiffSum< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int > >
-                                   ( const tnlParallelReductionDiffSum< double, int>& operation,
-                                     const typename tnlParallelReductionDiffSum< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
-                                   ( const tnlParallelReductionDiffSum< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
-                                   ( const tnlParallelReductionDiffSum< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< int, long int > >
-                                   ( const tnlParallelReductionDiffSum< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffSum< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< float, long int > >
-                                   ( const tnlParallelReductionDiffSum< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffSum< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long int > >
-                                   ( const tnlParallelReductionDiffSum< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffSum< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
-                                   ( const tnlParallelReductionDiffSum< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffSum< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffSum< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff min
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, int > >
-                                   ( const tnlParallelReductionDiffMin< char, int >& operation,
-                                     const typename tnlParallelReductionDiffMin< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< int, int > >
-                                   ( const tnlParallelReductionDiffMin< int, int >& operation,
-                                     const typename tnlParallelReductionDiffMin< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< float, int > >
-                                   ( const tnlParallelReductionDiffMin< float, int >& operation,
-                                     const typename tnlParallelReductionDiffMin< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int > >
-                                   ( const tnlParallelReductionDiffMin< double, int>& operation,
-                                     const typename tnlParallelReductionDiffMin< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
-                                   ( const tnlParallelReductionDiffMin< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
-                                   ( const tnlParallelReductionDiffMin< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< int, long int > >
-                                   ( const tnlParallelReductionDiffMin< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffMin< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< float, long int > >
-                                   ( const tnlParallelReductionDiffMin< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffMin< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long int > >
-                                   ( const tnlParallelReductionDiffMin< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffMin< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
-                                   ( const tnlParallelReductionDiffMin< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMin< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMin< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff max
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, int > >
-                                   ( const tnlParallelReductionDiffMax< char, int >& operation,
-                                     const typename tnlParallelReductionDiffMax< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< int, int > >
-                                   ( const tnlParallelReductionDiffMax< int, int >& operation,
-                                     const typename tnlParallelReductionDiffMax< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< float, int > >
-                                   ( const tnlParallelReductionDiffMax< float, int >& operation,
-                                     const typename tnlParallelReductionDiffMax< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int > >
-                                   ( const tnlParallelReductionDiffMax< double, int>& operation,
-                                     const typename tnlParallelReductionDiffMax< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
-                                   ( const tnlParallelReductionDiffMax< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
-                                   ( const tnlParallelReductionDiffMax< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< int, long int > >
-                                   ( const tnlParallelReductionDiffMax< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffMax< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< float, long int > >
-                                   ( const tnlParallelReductionDiffMax< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffMax< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long int > >
-                                   ( const tnlParallelReductionDiffMax< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffMax< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
-                                   ( const tnlParallelReductionDiffMax< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffMax< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffMax< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff abs sum
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< char, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< int, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< int, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< float, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< float, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsSum< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff abs min
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< char, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< int, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< int, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< float, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< float, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMin< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-/****
- * Diff abs max
- */
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< char, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< char, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< char, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< char, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< char, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< int, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< int, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< int, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< int, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< int, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< int, int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< float, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< float, int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffAbsMax< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-
-/****
- * Diff Lp Norm
- */
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< float, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< float, int >& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< float, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< float, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< float, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< float, int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< double, int>& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< char, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< char, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< char, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< int, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< int, long int >& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< int, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< int, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< int, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< int, long int > :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< float, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< float, long int >& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< float, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< float, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< float, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< float, long int> :: ResultType& result );
-
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< double, long int>& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, long int > :: DataType1* deviceInput1,
-                                     const typename tnlParallelReductionDiffLpNorm< long double, long int > :: DataType2* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
-#endif
-#endif
-
-#endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
index b1bc4dec94817845791641d9a59b9d5bb43ded35..b87d633e1c919214a9c0ff4c27f2b341f383b501 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
@@ -49,14 +49,11 @@ getVectorMax( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionMax< RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( RealType* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -68,14 +65,11 @@ getVectorMin( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionMin< RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( RealType* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -87,14 +81,11 @@ getVectorAbsMax( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( RealType* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -106,14 +97,11 @@ getVectorAbsMin( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( RealType* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -125,14 +113,11 @@ getVectorL1Norm( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( RealType* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -144,13 +129,11 @@ getVectorL2Norm( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionL2Norm< Real, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( Real* ) 0,
-                                       result );
+   const ResultType result = Reduction< Devices::Cuda >::reduce( operation,
+                                                                 v.getSize(),
+                                                                 v.getData(),
+                                                                 ( Real* ) 0 );
    return std::sqrt( result );
 }
 
@@ -169,14 +152,13 @@ getVectorLpNorm( const Vector& v,
       return getVectorL1Norm< Vector, ResultType >( v );
    if( p == 2 )
       return getVectorL2Norm< Vector, ResultType >( v );
-   ResultType result( 0 );
+
    Algorithms::ParallelReductionLpNorm< Real, ResultType, Real_ > operation;
    operation.setPower( p );
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( Real* ) 0,
-                                       result );
+   const ResultType result = Reduction< Devices::Cuda >::reduce( operation,
+                                                                 v.getSize(),
+                                                                 v.getData(),
+                                                                 ( Real* ) 0 );
    return std::pow( result, 1.0 / p );
 }
 
@@ -189,14 +171,11 @@ getVectorSum( const Vector& v )
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionSum< Real, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v.getSize(),
-                                       v.getData(),
-                                       ( Real* ) 0,
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( Real* ) 0 );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -208,14 +187,11 @@ getVectorDifferenceMax( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -227,14 +203,11 @@ getVectorDifferenceMin( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 
@@ -247,14 +220,11 @@ getVectorDifferenceAbsMax( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -266,14 +236,11 @@ getVectorDifferenceAbsMin( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -285,14 +252,11 @@ getVectorDifferenceL1Norm( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -301,18 +265,14 @@ VectorOperations< Devices::Cuda >::
 getVectorDifferenceL2Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1::RealType Real;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
+   const ResultType result = Reduction< Devices::Cuda >::reduce( operation,
+                                                                 v1.getSize(),
+                                                                 v1.getData(),
+                                                                 v2.getData() );
    return std::sqrt( result );
 }
 
@@ -323,20 +283,21 @@ getVectorDifferenceLpNorm( const Vector1& v1,
                            const Vector2& v2,
                            const Real_ p )
 {
-   typedef typename Vector1::RealType Real;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
    TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
 
-   ResultType result( 0 );
+   if( p == 1.0 )
+      return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 );
+   if( p == 2.0 )
+      return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 );
+
    Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Real_ > operation;
    operation.setPower( p );
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
+   const ResultType result = Reduction< Devices::Cuda >::reduce( operation,
+                                                                 v1.getSize(),
+                                                                 v1.getData(),
+                                                                 v2.getData() );
    return std::pow( result, 1.0 / p );
 }
 
@@ -346,19 +307,14 @@ VectorOperations< Devices::Cuda >::
 getVectorDifferenceSum( const Vector1& v1,
                         const Vector2& v2 )
 {
-   typedef typename Vector1::RealType Real;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 #ifdef HAVE_CUDA
@@ -412,14 +368,11 @@ getScalarProduct( const Vector1& v1,
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0 );
    Algorithms::ParallelReductionScalarProduct< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   Reduction< Devices::Cuda >::reduce( operation,
-                                       v1.getSize(),
-                                       v1.getData(),
-                                       v2.getData(),
-                                       result );
-   return result;
+   return Reduction< Devices::Cuda >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
index ef938886ed93821d1cd071402c2ac85b66e22c42..5f51938afb301361737a307a9b51ccb59a7f288a 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
@@ -46,18 +46,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorMax( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType RealType;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result = v.getElement( 0 );
-   const Index n = v.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result = max( result, v.getElement( i ) );
-   return result;
+   Algorithms::ParallelReductionMax< RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -65,18 +62,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorMin( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType RealType;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result = v.getElement( 0 );
-   const Index n = v.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result = min( result, v.getElement( i ) );
-   return result;
+   Algorithms::ParallelReductionMin< RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -84,18 +78,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorAbsMax( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType RealType;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result = TNL::abs( v.getElement( 0 ) );
-   const Index n = v.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result = max( result, (ResultType) TNL::abs( v.getElement( i ) ) );
-   return result;
+   Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 
@@ -104,18 +95,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorAbsMin( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType RealType;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result = TNL::abs( v.getElement( 0 ) );
-   const Index n = v.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result = min( result, (ResultType) TNL::abs( v.getElement( i ) ) );
-   return result;
+   Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -123,18 +111,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorL1Norm( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType RealType;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0.0 );
-   const Index n = v.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += TNL::abs( v[ i ] );
-   return result;
+   Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( RealType* ) 0 );
 }
 
 template< typename Vector, typename ResultType >
@@ -143,60 +128,15 @@ VectorOperations< Devices::Host >::
 getVectorL2Norm( const Vector& v )
 {
    typedef typename Vector::RealType Real;
-   typedef typename Vector::IndexType Index;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   const Index n = v.getSize();
-
-#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
-#ifdef __GNUC__
-   // We need to get the address of the first element to avoid
-   // bounds checking in TNL::Array::operator[]
-   const Real* V = v.getData();
-#endif
-
-   ResultType result1 = 0, result2 = 0, result3 = 0, result4 = 0;
-   Index i = 0;
-   const Index unroll_limit = n - n % 4;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for \
-       if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \
-       reduction(+:result1,result2,result3,result4) \
-       lastprivate(i)
-#endif
-   for( i = 0; i < unroll_limit; i += 4 )
-   {
-#ifdef __GNUC__
-      __builtin_prefetch(V + i + PrefetchDistance, 0, 0);
-#endif
-      result1 += v[ i ] * v[ i ];
-      result2 += v[ i + 1 ] * v[ i + 1 ];
-      result3 += v[ i + 2 ] * v[ i + 2 ];
-      result4 += v[ i + 3 ] * v[ i + 3 ];
-   }
-
-   while( i < n )
-   {
-      result1 += v[ i ] * v[ i ];
-      i++;
-   }
-
-   return std::sqrt(result1 + result2 + result3 + result4);
-
-#else // OPTIMIZED_VECTOR_HOST_OPERATIONS
-
-   ResultType result( 0.0 );
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-   {
-      const Real& aux = v[ i ];
-      result += aux * aux;
-   }
+   Algorithms::ParallelReductionL2Norm< Real, ResultType > operation;
+   const ResultType result = Reduction< Devices::Host >::reduce( operation,
+                                                                 v.getSize(),
+                                                                 v.getData(),
+                                                                 ( Real* ) 0 );
    return std::sqrt( result );
-#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
 }
 
 template< typename Vector, typename ResultType, typename Real_ >
@@ -205,7 +145,7 @@ VectorOperations< Devices::Host >::
 getVectorLpNorm( const Vector& v,
                  const Real_ p )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType Real;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
@@ -215,13 +155,12 @@ getVectorLpNorm( const Vector& v,
    if( p == 2.0 )
       return getVectorL2Norm< Vector, ResultType >( v );
 
-   ResultType result( 0.0 );
-   const Index n = v.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += std::pow( TNL::abs( v[ i ] ), p );
+   Algorithms::ParallelReductionLpNorm< Real, ResultType, Real_ > operation;
+   operation.setPower( p );
+   const ResultType result = Reduction< Devices::Host >::reduce( operation,
+                                                                 v.getSize(),
+                                                                 v.getData(),
+                                                                 ( Real* ) 0 );
    return std::pow( result, 1.0 / p );
 }
 
@@ -230,18 +169,15 @@ ResultType
 VectorOperations< Devices::Host >::
 getVectorSum( const Vector& v )
 {
-   typedef typename Vector::IndexType Index;
+   typedef typename Vector::RealType Real;
 
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
 
-   ResultType result( 0.0 );
-   const Index n = v.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += v[ i ];
-   return result;
+   Algorithms::ParallelReductionSum< Real, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v.getSize(),
+                                              v.getData(),
+                                              ( Real* ) 0 );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -250,19 +186,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceMax( const Vector1& v1,
                         const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result = v1.getElement( 0 ) - v2.getElement( 0 );
-   const Index n = v1.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result =  max( result, v1.getElement( i ) - v2.getElement( i ) );
-   return result;
+   Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -271,19 +202,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceMin( const Vector1& v1,
                         const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result = v1.getElement( 0 ) - v2.getElement( 0 );
-   const Index n = v1.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result =  min( result, v1.getElement( i ) - v2.getElement( i ) );
-   return result;
+   Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -292,19 +218,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceAbsMax( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result = TNL::abs( v1.getElement( 0 ) - v2.getElement( 0 ) );
-   const Index n = v1.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result =  max( result, (ResultType) TNL::abs( v1.getElement( i ) - v2.getElement( i ) ) );
-   return result;
+   Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -313,19 +234,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceAbsMin( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result = TNL::abs( v1[ 0 ] - v2[ 0 ] );
-   const Index n = v1.getSize();
-#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
-#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 1; i < n; i ++ )
-      result =  min( result, (ResultType) TNL::abs( v1[ i ] - v2[ i ] ) );
-   return result;
+   Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -334,19 +250,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceL1Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0.0 );
-   const Index n = v1.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += TNL::abs( v1[ i ] - v2[ i ] );
-   return result;
+   Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2, typename ResultType >
@@ -355,21 +266,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceL2Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0.0 );
-   const Index n = v1.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-   {
-      ResultType aux = TNL::abs( v1[ i ] - v2[ i ] );
-      result += aux * aux;
-   }
+   Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   const ResultType result = Reduction< Devices::Host >::reduce( operation,
+                                                                 v1.getSize(),
+                                                                 v1.getData(),
+                                                                 v2.getData() );
    return std::sqrt( result );
 }
 
@@ -381,8 +285,6 @@ getVectorDifferenceLpNorm( const Vector1& v1,
                            const Vector2& v2,
                            const Real_ p )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
    TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
@@ -392,13 +294,12 @@ getVectorDifferenceLpNorm( const Vector1& v1,
    if( p == 2.0 )
       return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 );
 
-   ResultType result( 0.0 );
-   const Index n = v1.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += std::pow( TNL::abs( v1.getElement( i ) - v2.getElement( i ) ), p );
+   Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Real_ > operation;
+   operation.setPower( p );
+   const ResultType result = Reduction< Devices::Host >::reduce( operation,
+                                                                 v1.getSize(),
+                                                                 v1.getData(),
+                                                                 v2.getData() );
    return std::pow( result, 1.0 / p );
 }
 
@@ -408,19 +309,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceSum( const Vector1& v1,
                         const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
 
-   ResultType result( 0.0 );
-   const Index n = v1.getSize();
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i ++ )
-      result += v1.getElement( i ) - v2.getElement( i );
-   return result;
+   Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 
@@ -449,60 +345,14 @@ VectorOperations< Devices::Host >::
 getScalarProduct( const Vector1& v1,
                   const Vector2& v2 )
 {
-   typedef typename Vector1::IndexType Index;
-
    TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
    TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-   const Index n = v1.getSize();
-
-#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
-#ifdef __GNUC__
-   typedef typename Vector1::RealType Real;
-   // We need to get the address of the first element to avoid
-   // bounds checking in TNL::Array::operator[]
-   const Real* V1 = v1.getData();
-   const Real* V2 = v2.getData();
-#endif
-
-   ResultType dot1 = 0.0, dot2 = 0.0, dot3 = 0.0, dot4 = 0.0;
-   Index i = 0;
-   const Index unroll_limit = n - n % 4;
-#ifdef HAVE_OPENMP
-   #pragma omp parallel for \
-      if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \
-      reduction(+:dot1,dot2,dot3,dot4) \
-      lastprivate(i)
-#endif
-   for( i = 0; i < unroll_limit; i += 4 )
-   {
-#ifdef __GNUC__
-      __builtin_prefetch(V1 + i + PrefetchDistance, 0, 0);
-      __builtin_prefetch(V2 + i + PrefetchDistance, 0, 0);
-#endif
-      dot1 += v1[ i ]     * v2[ i ];
-      dot2 += v1[ i + 1 ] * v2[ i + 1 ];
-      dot3 += v1[ i + 2 ] * v2[ i + 2 ];
-      dot4 += v1[ i + 3 ] * v2[ i + 3 ];
-   }
-
-   while( i < n )
-   {
-      dot1 += v1[ i ] * v2[ i ];
-      i++;
-   }
-
-   return dot1 + dot2 + dot3 + dot4;
-
-#else // OPTIMIZED_VECTOR_HOST_OPERATIONS
 
-   ResultType result( 0.0 );
-#ifdef HAVE_OPENMP
-   #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
-#endif
-   for( Index i = 0; i < n; i++ )
-      result += v1[ i ] * v2[ i ];
-   return result;
-#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
+   Algorithms::ParallelReductionScalarProduct< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
+   return Reduction< Devices::Host >::reduce( operation,
+                                              v1.getSize(),
+                                              v1.getData(),
+                                              v2.getData() );
 }
 
 template< typename Vector1, typename Vector2 >
@@ -520,42 +370,6 @@ addVector( Vector1& y,
 
    const Index n = y.getSize();
 
-#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
-#ifdef __GNUC__
-   // We need to get the address of the first element to avoid
-   // bounds checking in TNL::Array::operator[]
-   typedef typename Vector1::RealType Real;   
-         Real* Y = y.getData();
-   const Real* X = x.getData();
-#endif
-
-   Index i = 0;
-   const Index unroll_limit = n - n % 4;
-#ifdef HAVE_OPENMP
-   #pragma omp parallel for \
-      if( n > OpenMPVectorOperationsThreshold ) \
-      lastprivate(i)
-#endif
-   for(i = 0; i < unroll_limit; i += 4)
-   {
-#ifdef __GNUC__
-      __builtin_prefetch(&y[ i + PrefetchDistance ], 1, 0);
-      __builtin_prefetch(&x[ i + PrefetchDistance ], 0, 0);
-#endif
-      y[ i ]     = thisMultiplicator * y[ i ]     + alpha * x[ i ];
-      y[ i + 1 ] = thisMultiplicator * y[ i + 1 ] + alpha * x[ i + 1 ];
-      y[ i + 2 ] = thisMultiplicator * y[ i + 2 ] + alpha * x[ i + 2 ];
-      y[ i + 3 ] = thisMultiplicator * y[ i + 3 ] + alpha * x[ i + 3 ];
-   }
-
-   while( i < n )
-   {
-      y[i] = thisMultiplicator * y[ i ] + alpha * x[ i ];
-      i++;
-   }
-
-#else // OPTIMIZED_VECTOR_HOST_OPERATIONS
-
    if( thisMultiplicator == 1.0 )
 #ifdef HAVE_OPENMP
 #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
@@ -568,7 +382,6 @@ addVector( Vector1& y,
 #endif
       for( Index i = 0; i < n; i ++ )
          y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
-#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
 }
 
 template< typename Vector1,
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
index f9aa3dfae9c8e69fbd33cbe0ea087c1500b2032b..9687a7e2af910833855f649aad2e5899c816c4fa 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
@@ -321,73 +321,6 @@ cudaPrefixSum( const Index size,
    }
 }
 
-#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
-extern template bool cudaPrefixSum( const int size,
-                                    const int blockSize,
-                                    const int *deviceInput,
-                                    int* deviceOutput,
-                                    tnlParallelReductionSum< int, int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-
-extern template bool cudaPrefixSum( const int size,
-                                    const int blockSize,
-                                    const float *deviceInput,
-                                    float* deviceOutput,
-                                    tnlParallelReductionSum< float, int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-extern template bool cudaPrefixSum( const int size,
-                                    const int blockSize,
-                                    const double *deviceInput,
-                                    double* deviceOutput,
-                                    tnlParallelReductionSum< double, int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool cudaPrefixSum( const int size,
-                                    const int blockSize,
-                                    const long double *deviceInput,
-                                    long double* deviceOutput,
-                                    tnlParallelReductionSum< long double, int >& operation,
-                                    const PrefixSumType prefixSumType );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-extern template bool cudaPrefixSum( const long int size,
-                                    const long int blockSize,
-                                    const int *deviceInput,
-                                    int* deviceOutput,
-                                    tnlParallelReductionSum< int, long int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-
-extern template bool cudaPrefixSum( const long int size,
-                                    const long int blockSize,
-                                    const float *deviceInput,
-                                    float* deviceOutput,
-                                    tnlParallelReductionSum< float, long int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-extern template bool cudaPrefixSum( const long int size,
-                                    const long int blockSize,
-                                    const double *deviceInput,
-                                    double* deviceOutput,
-                                    tnlParallelReductionSum< double, long int >& operation,
-                                    const PrefixSumType prefixSumType );
-
-#ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool cudaPrefixSum( const long int size,
-                                    const long int blockSize,
-                                    const long double *deviceInput,
-                                    long double* deviceOutput,
-                                    tnlParallelReductionSum< long double, long int >& operation,
-                                    const PrefixSumType prefixSumType );
-#endif
-#endif
-
-#endif
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 8b2b1ef33e64b1dbd2330d375594ada14d19eacc..7e98a0d3db82209e70e810f6773f56e12094f18a 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -93,8 +93,8 @@ public:
    // must match (i.e. copy-assignment cannot resize).
    ArrayView& operator=( const ArrayView& view );
 
-   template< typename Value_, typename Device_, typename Index_ >
-   ArrayView& operator=( const ArrayView< Value_, Device_, Index_ >& view );
+   template< typename Array >
+   ArrayView& operator=( const Array& array );
 
 
    static String getType();
diff --git a/src/TNL/Containers/ArrayView_impl.h b/src/TNL/Containers/ArrayView_impl.h
index d755c4c3111812df4a00f8f633c44f4736c0aefc..96d00e7dea42ae244f928f5679a1045ba78af856 100644
--- a/src/TNL/Containers/ArrayView_impl.h
+++ b/src/TNL/Containers/ArrayView_impl.h
@@ -123,14 +123,14 @@ operator=( const ArrayView& view )
 template< typename Value,
            typename Device,
            typename Index >
-   template< typename Value_, typename Device_, typename Index_ >
+   template< typename Array >
 ArrayView< Value, Device, Index >&
 ArrayView< Value, Device, Index >::
-operator=( const ArrayView< Value_, Device_, Index_ >& view )
+operator=( const Array& array )
 {
-   TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( getSize(), array.getSize(), "The sizes of the array views must be equal, views are not resizable." );
    if( getSize() > 0 )
-      Algorithms::ArrayOperations< Device, Device_ >::copyMemory( getData(), view.getData(), getSize() );
+      Algorithms::ArrayOperations< Device, typename Array::DeviceType >::copyMemory( getData(), array.getData(), getSize() );
    return *this;
 }
 
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 4ef863aa90dc27c69aa14b119414f8fa0b0dddbd..fe5a650ef3a4a2711d5f46be27c14d97031d472b 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -42,14 +42,24 @@ public:
 #ifndef __NVCC__
    using BaseType::ArrayView;
 #else
-   // workaround for a bug in nvcc 8.0 (seems to be fixed in 9.0)
+   // workaround for nvcc 8.0, otherwise the templated constructor below fails
+   // (works fine in nvcc 9.0)
    using ArrayView< Real, Device, Index >::ArrayView;
 #endif
 
+   // inherit all ArrayView's assignment operators
+   using BaseType::operator=;
+
+   // In C++14, a default constructor is not brought in by an inheriting
+   // constructor declaration (although Clang, and GCC since 7.0, do inherit it).
+   // https://stackoverflow.com/a/51854172
+   __cuda_callable__
+   VectorView() = default;
+
    // initialization by base class is not a copy constructor so it has to be explicit
-   template< typename Element_ >  // template catches both const and non-const qualified Element
+   template< typename Real_ >  // template catches both const and non-const qualified Real
    __cuda_callable__
-   VectorView( const ArrayView< Element_, Device, Index >& view )
+   VectorView( const ArrayView< Real_, Device, Index >& view )
    : BaseType::ArrayView( view ) {}
 
 
diff --git a/src/TNL/Devices/Cuda.cu b/src/TNL/Devices/Cuda.cu
index 2605e6dca83290eb59db54618b7bf91ed1e59150..c1e5248330d0b46d23258a200238756c36aa314d 100644
--- a/src/TNL/Devices/Cuda.cu
+++ b/src/TNL/Devices/Cuda.cu
@@ -103,11 +103,10 @@ void Cuda::printThreadsSetup( const dim3& blockSize,
 }
 
 
-bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
-{   
-   if( error == cudaSuccess )
-      return true;
-   throw Exceptions::CudaRuntimeError( error, file_name, line );
+void Cuda::checkDevice( const char* file_name, int line, cudaError error )
+{
+   if( error != cudaSuccess )
+      throw Exceptions::CudaRuntimeError( error, file_name, line );
 }
 
 std::ostream& operator << ( std::ostream& str, const dim3& d )
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index c73e327e9ac84ab10752a66e783a46da1b288c72..123d3a96c6d940be44208785fb5c776586a0d52d 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -153,9 +153,9 @@ class Cuda
     * of calling cudaGetLastError() inside the method.
     * We recommend to use macro 'TNL_CHECK_CUDA_DEVICE' defined bellow.
     */
-   static bool checkDevice( const char* file_name, int line, cudaError error );
+   static void checkDevice( const char* file_name, int line, cudaError error );
 #else
-   static bool checkDevice() { return false; };
+   static void checkDevice() {}
 #endif
    
    static void configSetup( Config::ConfigDescription& config, const String& prefix = "" );
diff --git a/src/TNL/Exceptions/CMakeLists.txt b/src/TNL/Exceptions/CMakeLists.txt
index 5af96a6b5ed1e9a2078da708a30a8e4bdf4c6159..28ffbdf4fa12617b8e515ca59f04843b8af1a45c 100644
--- a/src/TNL/Exceptions/CMakeLists.txt
+++ b/src/TNL/Exceptions/CMakeLists.txt
@@ -3,7 +3,6 @@ SET( headers CudaBadAlloc.h
              CudaSupportMissing.h
              MICBadAlloc.h
              MICSupportMissing.h
-             MPISupportMissing.h
-             UnsupportedDimension.h )
+             MPISupportMissing.h )
 
 INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Exceptions )
diff --git a/src/TNL/Exceptions/MPISupportMissing.h b/src/TNL/Exceptions/MPISupportMissing.h
index 01a08e8973f8d30b75dd3f70a1de0598d6ccd5c5..0cbe7357da60a4c3f3a435d33aaf81def7b3a9ad 100644
--- a/src/TNL/Exceptions/MPISupportMissing.h
+++ b/src/TNL/Exceptions/MPISupportMissing.h
@@ -1,15 +1,13 @@
 /***************************************************************************
                           MPISupportMissing.h  -  description
                              -------------------
-    begin                : Jun 11, 2017
+    begin                : Jun 11, 2018
     copyright            : (C) 2018 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-// Implemented by: Jakub Klinkovsky
-
 #pragma once
 
 #include <stdexcept>
diff --git a/src/TNL/Exceptions/UnsupportedDimension.h b/src/TNL/Exceptions/UnsupportedDimension.h
deleted file mode 100644
index cdb9197370cf24376e41922d1d52364d0fc115aa..0000000000000000000000000000000000000000
--- a/src/TNL/Exceptions/UnsupportedDimension.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-                          UnsupportedDimension.h  -  description
-                             -------------------
-    begin                : Aug 14, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-namespace TNL {
-namespace Exceptions {
-
-struct UnsupportedDimension
-{
-   UnsupportedDimension( int Dimension )
-   : Dimension( Dimension )
-   {
-   }
-   
-   const char* what() const throw()
-   {
-      return "This dimension is not supported (yet).";
-   }
-   
-   int Dimension;
-};
-
-} // namespace Exceptions
-} // namespace TNL
diff --git a/src/TNL/File_impl.h b/src/TNL/File_impl.h
index 0b7d18ad3bb04d4ebca507a021e60825c7c04f19..a27250242144156fdf156b3d889cda3420b069e0 100644
--- a/src/TNL/File_impl.h
+++ b/src/TNL/File_impl.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <type_traits>
+#include <memory>
 
 #include <TNL/File.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
@@ -89,35 +90,27 @@ bool File::read_impl( Type* buffer,
    this->readElements = 0;
    const std::size_t host_buffer_size = std::min( FileGPUvsCPUTransferBufferSize / sizeof( Type ), elements );
    using BaseType = typename std::remove_cv< Type >::type;
-   BaseType* host_buffer = new BaseType[ host_buffer_size ];
+   std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] };
 
    while( readElements < elements )
    {
       std::size_t transfer = std::min( elements - readElements, host_buffer_size );
-      std::size_t transfered = std::fread( host_buffer, sizeof( Type ), transfer, file );
+      std::size_t transfered = std::fread( host_buffer.get(), sizeof( Type ), transfer, file );
       if( transfered != transfer )
       {
          std::cerr << "I am not able to read the data from the file " << fileName << "." << std::endl;
          std::cerr << transfered << " bytes were transfered. " << std::endl;
          std::perror( "Fread ended with the error code" );
-         delete[] host_buffer;
          return false;
       }
 
-      cudaMemcpy( ( void* ) & ( buffer[ readElements ] ),
-                  host_buffer,
+      cudaMemcpy( (void*) &buffer[ readElements ],
+                  (void*) host_buffer.get(),
                   transfer * sizeof( Type ),
                   cudaMemcpyHostToDevice );
-      if( ! TNL_CHECK_CUDA_DEVICE )
-      {
-         std::cerr << "Transfer of data from the CUDA device to the file " << this->fileName
-              << " failed." << std::endl;
-         delete[] host_buffer;
-         return false;
-      }
+      TNL_CHECK_CUDA_DEVICE;
       this->readElements += transfer;
    }
-   delete[] host_buffer;
    return true;
 #else
    throw Exceptions::CudaSupportMissing();
@@ -233,35 +226,27 @@ bool File::write_impl( const Type* buffer,
    const std::size_t host_buffer_size = std::min( FileGPUvsCPUTransferBufferSize / sizeof( Type ),
                                              elements );
    using BaseType = typename std::remove_cv< Type >::type;
-   BaseType* host_buffer = new BaseType[ host_buffer_size ];
+   std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] };
 
    while( this->writtenElements < elements )
    {
       std::size_t transfer = std::min( elements - this->writtenElements, host_buffer_size );
-      cudaMemcpy( host_buffer,
-                  ( void* ) & ( buffer[ this->writtenElements ] ),
+      cudaMemcpy( (void*) host_buffer.get(),
+                  (void*) &buffer[ this->writtenElements ],
                   transfer * sizeof( Type ),
                   cudaMemcpyDeviceToHost );
-      if( ! TNL_CHECK_CUDA_DEVICE )
-      {
-         std::cerr << "Transfer of data from the file " << this->fileName
-              << " to the CUDA device failed." << std::endl;
-         delete[] host_buffer;
-         return false;
-      }
-      if( std::fwrite( host_buffer,
+      TNL_CHECK_CUDA_DEVICE;
+      if( std::fwrite( host_buffer.get(),
                        sizeof( Type ),
                        transfer,
                        this->file ) != transfer )
       {
          std::cerr << "I am not able to write the data to the file " << fileName << "." << std::endl;
          std::perror( "Fwrite ended with the error code" );
-         delete[] host_buffer;
          return false;
       }
       this->writtenElements += transfer;
    }
-   delete[] host_buffer;
    return true;
 #else
    throw Exceptions::CudaSupportMissing();
diff --git a/src/TNL/Functions/TestFunction_impl.h b/src/TNL/Functions/TestFunction_impl.h
index 5a7e76485339306c4e506202d3574d59571e6b45..3e7da8c33ddc00b236f75cfac25646741eb0e78d 100644
--- a/src/TNL/Functions/TestFunction_impl.h
+++ b/src/TNL/Functions/TestFunction_impl.h
@@ -137,8 +137,7 @@ setupFunction( const Config::ParameterContainer& parameters,
    {
       this->function = Devices::Cuda::passToDevice( *auxFunction );
       delete auxFunction;
-      if( ! TNL_CHECK_CUDA_DEVICE )
-         return false;
+      TNL_CHECK_CUDA_DEVICE;
    }
    return true;
 }
@@ -167,8 +166,7 @@ setupOperator( const Config::ParameterContainer& parameters,
    {
       this->operator_ = Devices::Cuda::passToDevice( *auxOperator );
       delete auxOperator;
-      if( ! TNL_CHECK_CUDA_DEVICE )
-         return false;
+      TNL_CHECK_CUDA_DEVICE;
    }
    return true;
 }
diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h
index 2d26fe9954cc5c21540485997072856c8c316f3d..f5fdc767346f9fbc90236f04ab000f3fdc20327b 100644
--- a/src/TNL/Matrices/AdEllpack.h
+++ b/src/TNL/Matrices/AdEllpack.h
@@ -83,6 +83,7 @@ public:
     typedef Device DeviceType;
     typedef Index IndexType;
     typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+    typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
     typedef AdEllpack< Real, Device, Index > ThisType;
     typedef AdEllpack< Real, Devices::Host, Index > HostType;
     typedef AdEllpack< Real, Devices::Cuda, Index > CudaType;
@@ -93,7 +94,7 @@ public:
 
     String getTypeVirtual() const;
 
-    void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+    void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
     IndexType getWarp( const IndexType row ) const;
 
@@ -155,7 +156,7 @@ public:
     void print( std::ostream& str ) const;
 
     bool balanceLoad( const RealType average,
-                      const CompressedRowLengthsVector& rowLengths,
+                      ConstCompressedRowLengthsVectorView rowLengths,
                       warpList* list );
 
     void computeWarps( const IndexType SMs,
@@ -166,7 +167,7 @@ public:
 
     void performRowTest();
 
-    void performRowLengthsTest( const CompressedRowLengthsVector& rowLengths );
+    void performRowLengthsTest( ConstCompressedRowLengthsVectorView rowLengths );
 
     IndexType getTotalLoad() const;
 
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index 55719e362dc14577ee683f7b49c71e6006ea6e51..daab3b8cdc351459120436186cb3060aeee54d8f 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -182,7 +182,7 @@ template< typename Real,
           typename Index >
 void
 AdEllpack< Real, Device, Index >::
-setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
     TNL_ASSERT( this->getRows() > 0, );
     TNL_ASSERT( this->getColumns() > 0, );
@@ -250,7 +250,7 @@ Index AdEllpack< Real, Device, Index >::getTotalLoad() const
 template< typename Real,
           typename Device,
           typename Index >
-void AdEllpack< Real, Device, Index >::performRowLengthsTest( const CompressedRowLengthsVector& rowLengths )
+void AdEllpack< Real, Device, Index >::performRowLengthsTest( ConstCompressedRowLengthsVectorView rowLengths )
 {
     bool found = false;
     for( IndexType row = 0; row < this->getRows(); row++ )
@@ -318,6 +318,8 @@ Index AdEllpack< Real, Device, Index >::getWarp( const IndexType row ) const
             ( ( this->rowOffset.getElement( searchedWarp ) < row ) && ( this->rowOffset.getElement( searchedWarp + 1 ) >= row ) ) )
             return searchedWarp;
     }
+    // FIXME: non-void function always has to return something sensible
+    throw "bug - row was not found";
 }
 
 template< typename Real,
@@ -474,7 +476,6 @@ bool AdEllpack< Real, Device, Index >::setRow( const IndexType row,
         warp++;
 
     bool found = false;
-    IndexType length = 0;
     IndexType elementPtr;
     IndexType elPtr = 0;
     while( ( !found ) && ( elPtr < elements ) )
@@ -694,8 +695,8 @@ template< typename Real,
           typename Device,
           typename Index >
 bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average,
-                                                             const CompressedRowLengthsVector& rowLengths,
-                                                             warpList* list )
+                                                    ConstCompressedRowLengthsVectorView rowLengths,
+                                                    warpList* list )
 {
     IndexType offset, rowOffset, localLoad, reduceMap[ 32 ];
 
diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h
index ef5f90d47397c4e8314e5dc64b54c51f60a8f013..b724a0ada17dc4efb2c1d0907054715107da01a4 100644
--- a/src/TNL/Matrices/BiEllpack.h
+++ b/src/TNL/Matrices/BiEllpack.h
@@ -36,6 +36,7 @@ public:
 	typedef Device DeviceType;
 	typedef Index IndexType;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
 	typedef BiEllpack< Real, Device, Index > ThisType;
@@ -51,7 +52,7 @@ public:
 	void setDimensions( const IndexType rows,
 	                    const IndexType columns );
 
-	void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
 	IndexType getRowLength( const IndexType row ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric.h b/src/TNL/Matrices/BiEllpackSymmetric.h
index c8d84bbc530ec30b13ce1e4953f972574ba10552..e44921fe837e7d0f0060df78af167c12ea1c102e 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric.h
@@ -27,6 +27,7 @@ public:
 	typedef Device DeviceType;
 	typedef Index IndexType;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
 	typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
 	typedef BiEllpackSymmetric< Real, Device, Index > ThisType;
@@ -41,7 +42,7 @@ public:
 
 	void setDimensions( const IndexType rows, const IndexType columns );
 
-	void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
 	IndexType getRowLength( const IndexType row ) const;
 
diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
index 9a7f380eebe7c5b54785055060594967275faa65..5b6f94b57b571963f5ba18c3c4c7e8fb7700fa99 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -96,7 +96,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int StripSize >
-void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
     if( this->getRows() % this->warpSize != 0 )
         this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
@@ -108,7 +108,7 @@ void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengt
     for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
         this->groupPointers.setElement( i, 0 );
 
-   // TODO: fix this
+   // FIXME: cannot sort a const vector!
     //DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
     //DeviceDependentCode::computeColumnSizes( *this, rowLengths );
 
@@ -149,7 +149,7 @@ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( c
                        << " this->getName() = " << std::endl );
 
     IndexType strip = row / this->warpSize;
-    IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip;
+    IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
     IndexType numberOfGroups = this->logWarpSize + 1;
     IndexType bisection = 1;
     for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
@@ -158,6 +158,12 @@ Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( c
             return ( numberOfGroups - i );
         bisection *= 2;
     }
+    // FIXME: non-void function always has to return something sensible
+#ifndef __CUDA_ARCH__
+    throw "bug - row was not found";
+#else
+    TNL_ASSERT_TRUE( false, "bug - row was not found" );
+#endif
 }
 
 template< typename Real,
@@ -390,6 +396,7 @@ bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setRow( const IndexTy
     }
     if( thisElementPtr == numberOfElements )
         return true;
+    return false;
 }
 
 template< typename Real,
@@ -576,8 +583,8 @@ __cuda_callable__
 Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
                                                                             const Index group ) const
 {
-    return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
-            - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group );
+    return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
+            - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 80b182db1e61cb73d4a82a20d5772935f06436e0..ea5e1efb9463915ec724b28b38abbfe64ac596b0 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -99,7 +99,7 @@ template< typename Real,
 	  int StripSize >
 void
 BiEllpack< Real, Device, Index, StripSize >::
-setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
 	if( this->getRows() % this->warpSize != 0 )
 		this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) );
@@ -112,8 +112,9 @@ setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
 	for( IndexType i = 0; i < this->groupPointers.getSize(); i++ )
 		this->groupPointers.setElement( i, 0 );
 
-	DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
-	DeviceDependentCode::computeColumnSizes( *this, rowLengths );
+   // FIXME: cannot sort a const vector!
+	//DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
+	//DeviceDependentCode::computeColumnSizes( *this, rowLengths );
 
 	this->groupPointers.computeExclusivePrefixSum();
 
@@ -153,7 +154,7 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde
 	                   << " this->getName() = " << std::endl; );
 
 	IndexType strip = row / this->warpSize;
-	IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip;
+	IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip;
 	IndexType numberOfGroups = this->logWarpSize + 1;
 	IndexType bisection = 1;
 	for( IndexType i = 0; i < this->logWarpSize + 1; i++ )
@@ -162,6 +163,12 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde
 			return ( numberOfGroups - i );
 		bisection *= 2;
 	}
+    // FIXME: non-void function always has to return something sensible
+#ifndef __CUDA_ARCH__
+    throw "bug - row was not found";
+#else
+    TNL_ASSERT_TRUE( false, "bug - row was not found" );
+#endif
 }
 
 template< typename Real,
@@ -396,6 +403,7 @@ setRow( const IndexType row,
 	}
 	if( thisElementPtr == numberOfElements )
 		return true;
+   return false;
 }
 
 template< typename Real,
@@ -580,8 +588,8 @@ __cuda_callable__
 Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip,
 																 	 	    const Index group ) const
 {
-    return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 )
-            - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group );
+    return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ]
+            - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ];
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/COOMatrix.h b/src/TNL/Matrices/COOMatrix.h
index a74a12f0496d3296315d5f308c43d1da487aa719..e67351ab9092c3ea3c504781b4409f4053601caa 100644
--- a/src/TNL/Matrices/COOMatrix.h
+++ b/src/TNL/Matrices/COOMatrix.h
@@ -34,6 +34,7 @@ public:
 	typedef Device DeviceType;
 	typedef Index IndexType;
 	typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
 	typedef COOMatrix< Real, Device, Index > ThisType;
 	typedef COOMatrix< Real, Devices::Host, Index > HostType;
 	typedef COOMatrix< Real, Devices::Cuda, Index > CudaType;
@@ -51,7 +52,7 @@ public:
 
 	IndexType getNumberOfUsedValues() const;
 
-	bool setCompressedRowLengths(const CompressedRowLengthsVector& rowLengths);
+	bool setCompressedRowLengths(ConstCompressedRowLengthsVectorView rowLengths);
 
 	void getRowLengths(Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths) const;
 
diff --git a/src/TNL/Matrices/COOMatrix_impl.h b/src/TNL/Matrices/COOMatrix_impl.h
index a5fc599abf91da2da7b9447fa69631d948a9403e..05439634140745c82bfb23f2df2c0042e7b3741a 100644
--- a/src/TNL/Matrices/COOMatrix_impl.h
+++ b/src/TNL/Matrices/COOMatrix_impl.h
@@ -84,7 +84,7 @@ Index COOMatrix< Real, Device, Index >::getNumberOfUsedValues() const
 template< typename Real,
 		  typename Device,
 		  typename Index >
-bool COOMatrix< Real, Device, Index >::setCompressedRowLengths(const CompressedRowLengthsVector& rowLengths)
+bool COOMatrix< Real, Device, Index >::setCompressedRowLengths(ConstCompressedRowLengthsVectorView rowLengths)
 {
 	IndexType size = 0;
 	for(IndexType row = 0; row < this->getRows(); row++)
diff --git a/src/TNL/Matrices/CSR.h b/src/TNL/Matrices/CSR.h
index f6d4c6d31b895476e16bc2ce2616a9bc77fce505..ef7ba5d6f925d2e56e0df64b2951fe1752a7f84f 100644
--- a/src/TNL/Matrices/CSR.h
+++ b/src/TNL/Matrices/CSR.h
@@ -45,6 +45,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef CSR< Real, Device, Index > ThisType;
    typedef CSR< Real, Devices::Host, Index > HostType;
    typedef CSR< Real, Devices::Cuda, Index > CudaType;
@@ -68,7 +69,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 1516e932231c900c6e56b5442b88133cd5267a1a..b4dff85470bf86021c69478138eb3be86f74d593 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Matrices/CSR.h>
-#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
 
 #ifdef HAVE_CUSPARSE
@@ -87,7 +87,7 @@ void CSR< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-void CSR< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -99,7 +99,7 @@ void CSR< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLen
     * necessary length of the vectors this->values
     * and this->columnIndexes.
     */
-   Containers::Vector< IndexType, DeviceType, IndexType > rowPtrs;
+   Containers::VectorView< IndexType, DeviceType, IndexType > rowPtrs;
    rowPtrs.bind( this->rowPointers.getData(), this->getRows() );
    rowPtrs = rowLengths;
    this->rowPointers.setElement( this->rows, 0 );
diff --git a/src/TNL/Matrices/ChunkedEllpack.h b/src/TNL/Matrices/ChunkedEllpack.h
index 8c4a47a320b6a38fff9ab7c9622f79c3caae6b4b..35bbfa89799eff2b248283cda4ef141bcf7eb039 100644
--- a/src/TNL/Matrices/ChunkedEllpack.h
+++ b/src/TNL/Matrices/ChunkedEllpack.h
@@ -77,6 +77,7 @@ public:
    typedef Index IndexType;
    typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
    typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef ChunkedEllpack< Real, Device, Index > ThisType;
    typedef ChunkedEllpack< Real, Devices::Host, Index > HostType;
    typedef ChunkedEllpack< Real, Devices::Cuda, Index > CudaType;
@@ -97,7 +98,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -253,9 +254,9 @@ public:
 
 protected:
 
-   void resolveSliceSizes( const Containers::Vector< Index, Devices::Host, Index >& rowLengths );
+   void resolveSliceSizes( ConstCompressedRowLengthsVectorView rowLengths );
 
-   bool setSlice( const CompressedRowLengthsVector& rowLengths,
+   bool setSlice( ConstCompressedRowLengthsVectorView rowLengths,
                   const IndexType sliceIdx,
                   IndexType& elementsToAllocation );
 
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 1a47fe4e608fa2d5087eb404730356517316a913..20dbfa68349b7f23ad66e36c300d1894ea7e40be 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -95,7 +95,7 @@ void ChunkedEllpack< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-void ChunkedEllpack< Real, Device, Index >::resolveSliceSizes( const Containers::Vector< Index, Devices::Host, Index >& rowLengths )
+void ChunkedEllpack< Real, Device, Index >::resolveSliceSizes( ConstCompressedRowLengthsVectorView rowLengths )
 {
    /****
     * Iterate over rows and allocate slices so that each slice has
@@ -132,7 +132,7 @@ void ChunkedEllpack< Real, Device, Index >::resolveSliceSizes( const Containers:
 template< typename Real,
           typename Device,
           typename Index >
-bool ChunkedEllpack< Real, Device, Index >::setSlice( const CompressedRowLengthsVector& rowLengths,
+bool ChunkedEllpack< Real, Device, Index >::setSlice( ConstCompressedRowLengthsVectorView rowLengths,
                                                                const IndexType sliceIndex,
                                                                IndexType& elementsToAllocation )
 {
@@ -215,7 +215,7 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( const CompressedRowLengths
 template< typename Real,
           typename Device,
           typename Index >
-void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -1336,7 +1336,7 @@ class ChunkedEllpackDeviceDependentCode< Devices::Host >
       template< typename Real,
                 typename Index >
       static void resolveSliceSizes( ChunkedEllpack< Real, Device, Index >& matrix,
-                                     const typename ChunkedEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
+                                     typename ChunkedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
          matrix.resolveSliceSizes( rowLengths );
       }
@@ -1397,7 +1397,7 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda >
       template< typename Real,
                 typename Index >
       static void resolveSliceSizes( ChunkedEllpack< Real, Device, Index >& matrix,
-                                     const typename ChunkedEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
+                                     typename ChunkedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
       }
  
diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h
index 2de30b3f96f5cf830f4f8c476bbec8025b00ad66..351e8a8c7631feb1cf952cb8dd57762723a29bfc 100644
--- a/src/TNL/Matrices/Dense.h
+++ b/src/TNL/Matrices/Dense.h
@@ -40,6 +40,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Matrix< Real, Device, Index >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Matrix< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef Dense< Real, Device, Index > ThisType;
    typedef Dense< Real, Devices::Host, Index > HostType;
    typedef Dense< Real, Devices::Cuda, Index > CudaType;
@@ -66,7 +67,7 @@ public:
    /****
     * This method is only for the compatibility with the sparse matrices.
     */
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    /****
     * Returns maximal number of the nonzero matrix elements that can be stored
diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h
index 32958f08b2e2551076f5a5e50dbdbc3cff50ba13..e8e9ed9f1c8b54db08345b910939c33905c83bd7 100644
--- a/src/TNL/Matrices/Dense_impl.h
+++ b/src/TNL/Matrices/Dense_impl.h
@@ -83,7 +83,7 @@ void Dense< Real, Device, Index >::setLike( const Dense< Real2, Device2, Index2
 template< typename Real,
           typename Device,
           typename Index >
-void Dense< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void Dense< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
 }
 
diff --git a/src/TNL/Matrices/Ellpack.h b/src/TNL/Matrices/Ellpack.h
index 38333685bfbc59cd94dec2197463ca40557a57e5..1646db1c5c8b37bb635af0ee3501afd3fce6e431 100644
--- a/src/TNL/Matrices/Ellpack.h
+++ b/src/TNL/Matrices/Ellpack.h
@@ -36,6 +36,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef Ellpack< Real, Device, Index > ThisType;
@@ -58,7 +59,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    void setConstantCompressedRowLengths( const IndexType& rowLengths );
 
diff --git a/src/TNL/Matrices/EllpackSymmetric.h b/src/TNL/Matrices/EllpackSymmetric.h
index 4d76a781756ac6abc1e96d4aef1a55bd88d34033..0720d9d5293f5190edfb3411807ae29b380d91d3 100644
--- a/src/TNL/Matrices/EllpackSymmetric.h
+++ b/src/TNL/Matrices/EllpackSymmetric.h
@@ -28,6 +28,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef EllpackSymmetric< Real, Device, Index > ThisType;
@@ -44,7 +45,7 @@ class EllpackSymmetric : public Sparse< Real, Device, Index >
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    bool setConstantRowLengths( const IndexType& rowLengths );
 
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph.h b/src/TNL/Matrices/EllpackSymmetricGraph.h
index 7b11b6b159adcf658f62f5af76941e7989636578..3a282c796be439209184023fb76aa692ff1e4294 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph.h
@@ -28,6 +28,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef EllpackSymmetricGraph< Real, Device, Index > ThisType;
@@ -44,7 +45,7 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    bool setConstantRowLengths( const IndexType& rowLengths );
 
@@ -152,10 +153,8 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
                   const int color ) const;
 #endif
 
-   __cuda_callable__
    void computePermutationArray();
 
-   __cuda_callable__
    bool rearrangeMatrix( bool verbose );
 
    bool save( File& file ) const;
@@ -181,21 +180,20 @@ class EllpackSymmetricGraph : public Sparse< Real, Device, Index >
    __cuda_callable__
    Index getRowsOfColor( IndexType color ) const;
 
-   __cuda_callable__
    void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix );
 
    __cuda_callable__
-   Containers::Vector< Index, Device, Index > getPermutationArray();
+   Containers::Vector< Index, Device, Index >& getPermutationArray();
 
    __cuda_callable__
-   Containers::Vector< Index, Device, Index > getInversePermutation();
+   Containers::Vector< Index, Device, Index >& getInversePermutation();
 
    __cuda_callable__
-   Containers::Vector< Index, Device, Index > getColorPointers();
+   Containers::Vector< Index, Device, Index >& getColorPointers();
 
    protected:
 
-   bool allocateElements();
+   void allocateElements();
 
    IndexType rowLengths, alignedRows;
 
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index f8b817d9a7cc048971ae9e173566c5d2998bd489..799d07281e62ae441c39162ee2fc5271997effac 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -83,7 +83,7 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexTyp
 template< typename Real,
           typename Device,
           typename Index >
-void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT( this->getRows() > 0, );
    TNL_ASSERT( this->getColumns() > 0, );
@@ -102,7 +102,7 @@ template< typename Real,
 __cuda_callable__
 Index EllpackSymmetricGraph< Real, Device, Index >::getRowsOfColor( IndexType color ) const
 {
-   return this->colorPointers.getElement( color + 1 ) - this->colorPointers.getElement( color );
+   return this->colorPointers[ color + 1 ] - this->colorPointers[ color ];
 }
 
 /*
@@ -174,7 +174,6 @@ void EllpackSymmetricGraph< Real, Device, Index >::computeColorsVector( Containe
 template< typename Real,
           typename Device,
           typename Index >
-__cuda_callable__
 void EllpackSymmetricGraph< Real, Device, Index >::computePermutationArray()
 {
    // init vector of colors and permutation array
@@ -238,7 +237,6 @@ void EllpackSymmetricGraph< Real, Device, Index >::verifyPermutationArray()
 template< typename Real,
           typename Device,
           typename Index >
-__cuda_callable__
 bool EllpackSymmetricGraph< Real, Device, Index >::rearrangeMatrix( bool verbose )
 {
    // first we need to know permutation
@@ -296,7 +294,8 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-Containers::Vector< Index, Device, Index > EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray()
+Containers::Vector< Index, Device, Index >&
+EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray()
 {
     return this->permutationArray;
 }
@@ -305,7 +304,8 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-Containers::Vector< Index, Device, Index > EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation()
+Containers::Vector< Index, Device, Index >&
+EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation()
 {
     return this->inversePermutationArray;
 }
@@ -314,7 +314,8 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-Containers::Vector< Index, Device, Index > EllpackSymmetricGraph< Real, Device, Index >::getColorPointers()
+Containers::Vector< Index, Device, Index >&
+EllpackSymmetricGraph< Real, Device, Index >::getColorPointers()
 {
     return this->colorPointers;
 }
@@ -322,7 +323,6 @@ Containers::Vector< Index, Device, Index > EllpackSymmetricGraph< Real, Device,
 template< typename Real,
           typename Device,
           typename Index >
-__cuda_callable__
 void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix )
 {
     //  TODO: fix
@@ -331,17 +331,17 @@ void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSy
     this->rearranged = true;
     this->rowLengths = matrix.getRowLengthsInt();
     this->alignedRows = matrix.getAlignedRows();
-    Containers::Vector< Index, Devices::Host, Index > colorPointers = matrix.getColorPointers();
+    Containers::Vector< Index, Devices::Host, Index >& colorPointers = matrix.getColorPointers();
     this->colorPointers.setSize( colorPointers.getSize() );
     for( IndexType i = 0; i < colorPointers.getSize(); i++ )
         this->colorPointers.setElement( i, colorPointers[ i ] );
 
-    Containers::Vector< Index,Devices::Host, Index > permutationArray = matrix.getPermutationArray();
+    Containers::Vector< Index,Devices::Host, Index >& permutationArray = matrix.getPermutationArray();
     this->permutationArray.setSize( permutationArray.getSize() );
     for( IndexType i = 0; i < permutationArray.getSize(); i++ )
         this->permutationArray.setElement( i, permutationArray[ i ] );
 
-    Containers::Vector< Index, Devices::Host, Index > inversePermutation = matrix.getInversePermutation();
+    Containers::Vector< Index, Devices::Host, Index >& inversePermutation = matrix.getInversePermutation();
     this->inversePermutationArray.setSize( inversePermutation.getSize() );
     for( IndexType i = 0; i < inversePermutation.getSize(); i++ )
         this->inversePermutationArray.setElement( i, inversePermutation[ i ] );
@@ -363,7 +363,7 @@ bool EllpackSymmetricGraph< Real, Device, Index >::setConstantRowLengths( const
    TNL_ASSERT( rowLengths > 0, std::cerr << " rowLengths = " << rowLengths );
    this->rowLengths = rowLengths;
    if( this->rows > 0 )
-      return allocateElements();
+      allocateElements();
    return true;
 }
 
@@ -445,37 +445,37 @@ bool EllpackSymmetricGraph< Real, Device, Index > :: addElementFast( const Index
                                                                      const RealType& thisElementMultiplicator )
 {
    typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType;
-   IndexType i = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) );
-   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) );
+   IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] );
+   const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] );
    const IndexType step = DDCType::getElementStep( *this );
 
    while( i < rowEnd &&
-         this->columnIndexes.getElement( i ) < column &&
-         this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step;
+         this->columnIndexes[ i ] < column &&
+         this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step;
    if( i == rowEnd )
       return false;
-   if( this->columnIndexes.getElement( i ) == column )
+   if( this->columnIndexes[ i ] == column )
    {
-      this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value);
+      this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value;
       return true;
    }
    else
-      if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() ) // artificial zero
+      if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero
       {
-         this->columnIndexes.setElement( i, column);
-         this->values.setElement( i, value);
+         this->columnIndexes[ i ] = column;
+         this->values[ i ] = value;
       }
       else
       {
          Index j = rowEnd - step;
          while( j > i )
          {
-            this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) );
-            this->values.setElement( j, this->values.getElement( j - step ) );
+            this->columnIndexes[ j ] = this->columnIndexes[ j - step ];
+            this->values[ j ] = this->values[ j - step ];
             j -= step;
          }
-         this->columnIndexes.setElement( i, column );
-         this->values.setElement( i, value );
+         this->columnIndexes[ i ] = column;
+         this->values[ i ] = value;
       }
    return true;
 }
@@ -783,6 +783,7 @@ bool EllpackSymmetricGraph< Real, Device, Index >::help( bool verbose )
 {
     if( !this->rearranged )
         return this->rearrangeMatrix( verbose );
+    return true;
 }
 
 template< typename Real,
@@ -810,7 +811,7 @@ void EllpackSymmetricGraph< Real, Device, Index >::print( std::ostream& str ) co
 template< typename Real,
           typename Device,
           typename Index >
-bool EllpackSymmetricGraph< Real, Device, Index >::allocateElements()
+void EllpackSymmetricGraph< Real, Device, Index >::allocateElements()
 {
    Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths );
 }
diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index c353cbb392191c5a787263e808a0f59110d5cad4..42202a883c0c887317aaf68904dfa0bddf27a646 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -66,7 +66,7 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row
 template< typename Real,
           typename Device,
           typename Index >
-void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT( this->getRows() > 0, );
    TNL_ASSERT( this->getColumns() > 0, );
@@ -85,7 +85,7 @@ bool EllpackSymmetric< Real, Device, Index >::setConstantRowLengths( const Index
              std::cerr << " rowLengths = " << rowLengths );
    this->rowLengths = rowLengths;
    if( this->rows > 0 )
-      return allocateElements();
+      allocateElements();
    return true;
 }
 
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 9801b6bcac54bdff89337428d3b83968ebf3759a..6186206439474d97d8edced12b4671b257b6f0ed 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -84,7 +84,7 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-void Ellpack< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -933,7 +933,7 @@ class EllpackDeviceDependentCode< Devices::Cuda >
             //Devices::Cuda::freeFromDevice( kernel_inVector );
             //Devices::Cuda::freeFromDevice( kernel_outVector );
             TNL_CHECK_CUDA_DEVICE;
-            cudaThreadSynchronize();
+            cudaDeviceSynchronize();
          #endif
  
       }
diff --git a/src/TNL/Matrices/Matrix.h b/src/TNL/Matrices/Matrix.h
index a30d8c2a420bc3d0cae2ad07977e72717cc5431f..b7c205998f77d442dbaf4efea7021f1f169b15cb 100644
--- a/src/TNL/Matrices/Matrix.h
+++ b/src/TNL/Matrices/Matrix.h
@@ -13,6 +13,7 @@
 #include <TNL/Object.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
 
 namespace TNL {
 namespace Matrices {   
@@ -27,6 +28,8 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
+   typedef Containers::VectorView< IndexType, DeviceType, IndexType > CompressedRowLengthsVectorView;
+   typedef Containers::VectorView< const IndexType, DeviceType, IndexType > ConstCompressedRowLengthsVectorView;
    typedef Containers::Vector< RealType, DeviceType, IndexType > ValuesVector;
 
    Matrix();
@@ -34,13 +37,15 @@ public:
    virtual void setDimensions( const IndexType rows,
                                  const IndexType columns );
 
-   virtual void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths ) = 0;
+   virtual void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) = 0;
 
    virtual IndexType getRowLength( const IndexType row ) const = 0;
 
    // TODO: implementation is not parallel
    // TODO: it would be nice if padding zeros could be stripped
-   virtual void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const;
+   void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const;
+
+   virtual void getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const;
 
    template< typename Real2, typename Device2, typename Index2 >
    void setLike( const Matrix< Real2, Device2, Index2 >& matrix );
diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h
index 9bfa472c5d7fcb6a69828780b44d39f022b6b446..6ae4dd07e47cda53e23e6c9eea5666ed53b4dba3 100644
--- a/src/TNL/Matrices/MatrixOperations.h
+++ b/src/TNL/Matrices/MatrixOperations.h
@@ -340,8 +340,7 @@ public:
       // TODO: use static storage, e.g. from the CudaReductionBuffer, to avoid frequent reallocations
       Containers::Vector< RealType, Devices::Cuda, IndexType > xDevice;
       xDevice.setSize( n );
-      if( ! Containers::Algorithms::ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< RealType, RealType, IndexType >( xDevice.getData(), x, n ) )
-         throw 1;
+      Containers::Algorithms::ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< RealType, RealType, IndexType >( xDevice.getData(), x, n );
 
       // desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
       const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h
index 9728020053edca6d340198f666accc32a4aca46d..cb685917fd676e0d062f400ab0fd42836dbe1e05 100644
--- a/src/TNL/Matrices/Matrix_impl.h
+++ b/src/TNL/Matrices/Matrix_impl.h
@@ -43,6 +43,15 @@ template< typename Real,
 void Matrix< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const
 {
    rowLengths.setSize( this->getRows() );
+   getCompressedRowLengths( CompressedRowLengthsVectorView( rowLengths ) );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+void Matrix< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const
+{
+   TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" );
    for( IndexType row = 0; row < this->getRows(); row++ )
       rowLengths.setElement( row, this->getRowLength( row ) );
 }
diff --git a/src/TNL/Matrices/Multidiagonal.h b/src/TNL/Matrices/Multidiagonal.h
index 9b8f18779ceb9afa15e3e27f3610a2b0fa23fde6..cfa798e7a254916455db5c896640495bdbc504c1 100644
--- a/src/TNL/Matrices/Multidiagonal.h
+++ b/src/TNL/Matrices/Multidiagonal.h
@@ -37,6 +37,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Matrix< Real, Device, Index >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Matrix< Real, Device, Index >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef Multidiagonal< Real, Device, Index > ThisType;
    typedef Multidiagonal< Real, Devices::Host, Index > HostType;
    typedef Multidiagonal< Real, Devices::Cuda, Index > CudaType;
@@ -57,7 +58,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
diff --git a/src/TNL/Matrices/Multidiagonal_impl.h b/src/TNL/Matrices/Multidiagonal_impl.h
index 5f7228d698db1a47dbc62f2b540c08b1e3f9b86c..bd4c24691252ac425466906c549168248aca8244 100644
--- a/src/TNL/Matrices/Multidiagonal_impl.h
+++ b/src/TNL/Matrices/Multidiagonal_impl.h
@@ -83,7 +83,7 @@ void Multidiagonal< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-void Multidiagonal< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void Multidiagonal< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    /****
     * TODO: implement some check here similar to the one in the tridiagonal matrix
diff --git a/src/TNL/Matrices/SlicedEllpack.h b/src/TNL/Matrices/SlicedEllpack.h
index 815728d7a58d588a1791c1aa80b84bcd81da8f4b..6f68f2fa8aea4979b8f4685d2ee25d3039653ea7 100644
--- a/src/TNL/Matrices/SlicedEllpack.h
+++ b/src/TNL/Matrices/SlicedEllpack.h
@@ -41,8 +41,8 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   const typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::CompressedRowLengthsVector* rowLengths,
-                                                                                   int gridIdx );
+                                                                          typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
+                                                                          int gridIdx );
 #endif
 
 template< typename Real,
@@ -65,6 +65,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef SlicedEllpack< Real, Device, Index, SliceSize > ThisType;
@@ -88,7 +89,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -225,7 +226,7 @@ protected:
    // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
 
 public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
+   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
                                                         const IndexType sliceIdx );
 #endif
 };
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric.h b/src/TNL/Matrices/SlicedEllpackSymmetric.h
index 0ef1c26d8eeb3d55ea7bd2467d732e886f12c069..d9abb0de2ef664fd4032e8e5b0e00203093eb250 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetric.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetric.h
@@ -30,7 +30,7 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::CompressedRowLengthsVector* rowLengths,
+                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
                                                                                    int gridIdx );
 #endif
 
@@ -46,6 +46,7 @@ class SlicedEllpackSymmetric : public Sparse< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef SlicedEllpackSymmetric< Real, Device, Index > ThisType;
@@ -62,7 +63,7 @@ class SlicedEllpackSymmetric : public Sparse< Real, Device, Index >
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -196,7 +197,7 @@ class SlicedEllpackSymmetric : public Sparse< Real, Device, Index >
    // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
 
    public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
+   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
                                                         const IndexType sliceIdx );
 
 #endif
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph.h
index 3cab23d1db6613fc1a2b93c5561987cae9a0a15d..a2ab000957227a88a9655888cb417cea6498f56f 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph.h
@@ -30,8 +30,8 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                       const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                       int gridIdx );
+                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
+                                                                                        int gridIdx );
 #endif
 
 template< typename Real,
@@ -46,6 +46,7 @@ class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef SlicedEllpackSymmetricGraph< Real, Device, Index > ThisType;
@@ -62,7 +63,7 @@ class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -194,10 +195,8 @@ class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
 
     void copyFromHostToCuda( SlicedEllpackSymmetricGraph< Real, Devices::Host, Index, SliceSize >& matrix );
 
-   __cuda_callable__
    bool rearrangeMatrix( bool verbose = false );
 
-   __cuda_callable__
    void computePermutationArray();
 
    Containers::Vector< Index, Device, Index > getSlicePointers();
@@ -229,7 +228,7 @@ class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index >
    // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
 
    public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
+   __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
                                                         const IndexType sliceIdx );
 
 #endif
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
index 720796425bf1a296f005507302afb81d6d1544ab..9f09a21c506bd08110bc8360e17d3bf1f3a4097c 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
@@ -65,7 +65,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT( this->getRows() > 0, );
    TNL_ASSERT( this->getColumns() > 0, );
@@ -701,7 +701,6 @@ Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowL
 
    Index rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
    Index rowEnd = rowBegin + rowLength;
-   Index step = 1;
    Index length = 0;
    for( Index i = rowBegin; i < rowEnd; i++ )
       if( this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
@@ -764,7 +763,6 @@ bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rearrangeMat
         for( IndexType row = slice * SliceSize; row < (slice + 1) * SliceSize && row < this->getRows(); row++ )
         {
             IndexType rowBegin = slicePointerOrig + rowLengthOrig * ( row - slice * SliceSize );
-            IndexType rowEnd = rowBegin + rowLengthOrig;
             IndexType elementPointer = rowBegin;
 
             IndexType sliceNew = this->permutationArray.getElement( row ) / SliceSize;
@@ -971,8 +969,8 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
-                                                                                                               const IndexType sliceIdx )
+__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
+                                                                                                                    const IndexType sliceIdx )
 {
    Index rowIdx = sliceIdx * SliceSize;
    Index rowInSliceIdx( 0 );
@@ -1040,8 +1038,8 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Host >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-      static bool computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpackSymmetricGraph< Real, Device, Index >::RowLengthsVector& rowLengths,
+      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
+                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
                                                    Containers::Vector< Index, Device, Index >& sliceRowLengths,
                                                    Containers::Vector< Index, Device, Index >& slicePointers )
       {
@@ -1120,11 +1118,11 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
-                                                                                   int gridIdx )
+                                                                                        typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
+                                                                                        int gridIdx )
 {
    const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( *rowLengths, sliceIdx );
+   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
 
@@ -1233,8 +1231,8 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-      static bool computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpackSymmetricGraph< Real, Device, Index >::RowLengthsVector& rowLengths,
+      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix,
+                                                   typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths,
                                                    Containers::Vector< Index, Device, Index >& sliceRowLengths,
                                                    Containers::Vector< Index, Device, Index >& slicePointers )
       {
@@ -1242,7 +1240,6 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
          typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
          Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
-         CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
          dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
@@ -1253,11 +1250,10 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
                cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
             SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
-                                                                               kernel_rowLengths,
+                                                                               rowLengths,
                                                                                gridIdx );
          }
          Devices::Cuda::freeFromDevice( kernel_matrix );
-         Devices::Cuda::freeFromDevice( kernel_rowLengths );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
index 14af7673483b7a153ca48f604850517e2ffe8ecb..402ac5a6c8128ab67bbf2393528fb3a2b58b9077 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
@@ -64,7 +64,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT( this->getRows() > 0, );
    TNL_ASSERT( this->getColumns() > 0, );
@@ -75,7 +75,7 @@ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowL
    // TODO: Uncomment the next line and fix the compilation
    //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );
 
-   TNL_ASSERT( false, "code fix required" );
+   throw std::runtime_error("code fix required");
 
    this->maxRowLength = rowLengths.max();
 
@@ -693,8 +693,8 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
-                                                                                                                  const IndexType sliceIdx )
+__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
+                                                                                                               const IndexType sliceIdx )
 {
    Index rowIdx = sliceIdx * SliceSize;
    Index rowInSliceIdx( 0 );
@@ -743,6 +743,7 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host >
       template< typename Real,
                 typename Index,
                 int SliceSize >
+      __cuda_callable__
       static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
                                        const Index row,
                                        Index& rowBegin,
@@ -762,8 +763,8 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-      static bool computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpackSymmetric< Real, Device, Index >::RowLengthsVector& rowLengths )
+      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
+                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
          Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
          while( row < matrix.getRows() )
@@ -806,11 +807,11 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths,
+                                                                                   typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
                                                                                    int gridIdx )
 {
    const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( *rowLengths, sliceIdx );
+   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
 
@@ -862,14 +863,13 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-      static bool computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpackSymmetric< Real, Device, Index >::RowLengthsVector& rowLengths )
+      static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix,
+                                                   typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
 #ifdef HAVE_CUDA
          typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
          Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
-         CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
          dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
@@ -880,11 +880,10 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
                cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
             SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
-                                                                               kernel_rowLengths,
+                                                                               rowLengths,
                                                                                gridIdx );
          }
          Devices::Cuda::freeFromDevice( kernel_matrix );
-         Devices::Cuda::freeFromDevice( kernel_rowLengths );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h
index 2ff01b49c51943c7411626257a57f273c6880b05..95a601a00a01ead3f11d0cd0ca0f96a0373b9606 100644
--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -82,7 +82,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-void SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" );
    TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" );
@@ -772,8 +772,8 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-__device__ void SlicedEllpack< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( const CompressedRowLengthsVector& rowLengths,
-                                                                                                               const IndexType sliceIdx )
+__device__ void SlicedEllpack< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths,
+                                                                                                      const IndexType sliceIdx )
 {
    Index rowIdx = sliceIdx * SliceSize;
    Index rowInSliceIdx( 0 );
@@ -843,7 +843,7 @@ class SlicedEllpackDeviceDependentCode< Devices::Host >
                 typename Index,
                 int SliceSize >
       static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
+                                                   typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
          Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
          while( row < matrix.getRows() )
@@ -888,11 +888,11 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix,
-                                                                                   const typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::CompressedRowLengthsVector* rowLengths,
-                                                                                   int gridIdx )
+                                                                          typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
+                                                                          int gridIdx )
 {
    const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-   matrix->computeMaximalRowLengthInSlicesCuda( *rowLengths, sliceIdx );
+   matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
 
@@ -984,13 +984,12 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
                 typename Index,
                 int SliceSize >
       static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
+                                                   typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
 #ifdef HAVE_CUDA
          typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
          Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
-         CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
          dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
@@ -1001,11 +1000,10 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
                cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
             SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
-                                                                               kernel_rowLengths,
+                                                                               rowLengths,
                                                                                gridIdx );
          }
          Devices::Cuda::freeFromDevice( kernel_matrix );
-         Devices::Cuda::freeFromDevice( kernel_rowLengths );
          TNL_CHECK_CUDA_DEVICE;
 #endif
          return true;
@@ -1053,7 +1051,7 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
             //Devices::Cuda::freeFromDevice( kernel_inVector );
             //Devices::Cuda::freeFromDevice( kernel_outVector );
             TNL_CHECK_CUDA_DEVICE;
-            cudaThreadSynchronize();
+            cudaDeviceSynchronize();
          #endif
       }
 
@@ -1094,8 +1092,8 @@ class SlicedEllpackDeviceDependentCode< Devices::MIC >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-            static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename SlicedEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths )
+      static bool computeMaximalRowLengthInSlices( SlicedEllpack< Real, Device, Index, SliceSize >& matrix,
+                                                   typename SlicedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths )
       {
          throw std::runtime_error("Not Implemented yet SlicedEllpackDeviceDependentCode< Devices::MIC >::computeMaximalRowLengthInSlices");
       }
diff --git a/src/TNL/Matrices/Sparse.h b/src/TNL/Matrices/Sparse.h
index 2a694826b94e9d757079f72942f3f810ce136885..110dd7a40a57372c9fc65ea94c6bac1adf3c0f54 100644
--- a/src/TNL/Matrices/Sparse.h
+++ b/src/TNL/Matrices/Sparse.h
@@ -26,7 +26,6 @@ class Sparse : public Matrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename Matrix< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector;
    typedef typename Matrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef Containers::Vector< IndexType, DeviceType, IndexType > ColumnIndexesVector;
    typedef Matrix< Real, Device, Index > BaseType;
@@ -34,8 +33,6 @@ class Sparse : public Matrix< Real, Device, Index >
 
    Sparse();
 
-   virtual void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths ) = 0;
-
    template< typename Real2, typename Device2, typename Index2 >
    void setLike( const Sparse< Real2, Device2, Index2 >& matrix );
 
diff --git a/src/TNL/Matrices/Tridiagonal.h b/src/TNL/Matrices/Tridiagonal.h
index 472cadffcd4194270d5218e3c0ea1415b2c7ae5c..153c3bdbc4b6235022a44bb297d09c7fe5cbc458 100644
--- a/src/TNL/Matrices/Tridiagonal.h
+++ b/src/TNL/Matrices/Tridiagonal.h
@@ -39,6 +39,7 @@ public:
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename Matrix< Real, Device, Index >::CompressedRowLengthsVector CompressedRowLengthsVector;
+   typedef typename Matrix< Real, Device, Index >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView;
    typedef Tridiagonal< Real, Device, Index > ThisType;
    typedef Tridiagonal< Real, Devices::Host, Index > HostType;
    typedef Tridiagonal< Real, Devices::Cuda, Index > CudaType;
@@ -58,7 +59,7 @@ public:
    void setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
+   void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h
index 66fe9d7e80a80f93a1d16f1e741008f4de6a0787..faee4815491672b467fec15774f9a7cacd5efa6c 100644
--- a/src/TNL/Matrices/Tridiagonal_impl.h
+++ b/src/TNL/Matrices/Tridiagonal_impl.h
@@ -75,7 +75,7 @@ void Tridiagonal< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-void Tridiagonal< Real, Device, Index >::setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
+void Tridiagonal< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    if( rowLengths[ 0 ] > 2 )
       throw std::logic_error( "Too many non-zero elements per row in a tri-diagonal matrix." );
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
index 5a0181a05b5599569605b9f762887a70db10f68b..2a6d35bf3aaac41fb4be603672f837171a2fb010 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
@@ -11,9 +11,7 @@
 #pragma once
 
 #include <cstdlib>
-#include <TNL/StaticVectorFor.h>
 #include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Exceptions/UnsupportedDimension.h>
 
 #include <iostream>
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
index ed43e38b265d353936bfb09f907096695cea5af4..d2f7a18edb5a59beb113a8b6710a28001c9a9c5b 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h
@@ -57,7 +57,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
         newMesh->setOrigin(origin+TNL::Containers::Scale(spaceSteps,localBegin));
         
         File meshFile;
-        meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
+        bool ok=meshFile.open( fileName+String("-mesh-")+distrGrid->printProcessCoords()+String(".tnl"),IOMode::write);
+        TNL_ASSERT_TRUE(ok,"Not able to open mesh file to write");
         newMesh->save( meshFile );
         meshFile.close();
 
@@ -72,7 +73,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
         CopyEntitiesHelper<MeshFunctionType>::Copy(meshFunction,newMeshFunction,localBegin,zeroCoord,localSize);
 
         File file;
-        file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
+        ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::write );
+        TNL_ASSERT_TRUE(ok,"Not able to open file to write");
         bool ret=newMeshFunction.save(file);
         file.close();
 
@@ -110,7 +112,8 @@ class DistributedGridIO<Functions::MeshFunction<MeshType>,LocalCopy,Device>
         zeroCoord.setValue(0);        
 
         File file;
-        file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
+        bool ok=file.open( fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), IOMode::read );
+        TNL_ASSERT_TRUE(ok,"Not able to open file to read");
         bool result=newMeshFunction.boundLoad(file);
         file.close();
         CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
@@ -151,12 +154,13 @@ class DistributedGridIO_MPIIOBase
        MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
 
 	   MPI_File file;
-       MPI_File_open( group,
+      int ok=MPI_File_open( group,
                       const_cast< char* >( fileName.getString() ),
                       MPI_MODE_CREATE | MPI_MODE_WRONLY,
                       MPI_INFO_NULL,
                       &file);
-
+      TNL_ASSERT_EQ(ok,0,"Open file failed");
+      
 		int written=save(file,meshFunction, data,0);
 
         MPI_File_close(&file);
@@ -322,13 +326,14 @@ class DistributedGridIO_MPIIOBase
         MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup()));
 
         MPI_File file;
-        MPI_File_open( group,
+        int ok=MPI_File_open( group,
                       const_cast< char* >( fileName.getString() ),
                       MPI_MODE_RDONLY,
                       MPI_INFO_NULL,
                       &file );
+        TNL_ASSERT_EQ(ok,0,"Open file failed");
 
-		bool ret= load(file, meshFunction, data,0)>0;
+		  bool ret= load(file, meshFunction, data,0)>0;
 
         MPI_File_close(&file);
 
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index 194e68967ccc3368983ce32aeca22f3af1f4e2be..26ff692e4d0e36d9d6783fc91d4fb620ad832d5a 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -393,9 +393,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
             TNL_ASSERT( this->pointer, );
             TNL_ASSERT( this->cuda_pointer, );
             cudaMemcpy( (void*) this->cuda_pointer, (void*) this->pointer, sizeof( ObjectType ), cudaMemcpyHostToDevice );
-            if( ! TNL_CHECK_CUDA_DEVICE ) {
-               return false;
-            }
+            TNL_CHECK_CUDA_DEVICE;
             this->set_last_sync_state();
             return true;
          }
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 810d85e99125bea191cd112e88771b8ef2488322..42e46b257f9eb309f458f28ff3e46e591b03091b 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -544,9 +544,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 #endif
             TNL_ASSERT( this->cuda_pointer, );
             cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
-            if( ! TNL_CHECK_CUDA_DEVICE ) {
-               return false;
-            }
+            TNL_CHECK_CUDA_DEVICE;
             this->set_last_sync_state();
             return true;
          }
diff --git a/src/TNL/Pointers/SmartPointersRegister.cpp b/src/TNL/Pointers/SmartPointersRegister.cpp
index cd57dfe3439b0846f65f0bf8bfaf573cfcbd6e91..01641661c1ae008e6517232fc0bb56572f09ff5a 100644
--- a/src/TNL/Pointers/SmartPointersRegister.cpp
+++ b/src/TNL/Pointers/SmartPointersRegister.cpp
@@ -44,7 +44,8 @@ bool SmartPointersRegister::synchronizeDevice( int deviceId )
       const auto & set = pointersOnDevices.at( deviceId );
       for( auto&& it : set )
          ( *it ).synchronize();
-      return TNL_CHECK_CUDA_DEVICE;
+      TNL_CHECK_CUDA_DEVICE;
+      return true;
    }
    catch( const std::out_of_range& ) {
       return false;
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 93a667c3553e65fc335c9a87e244d6e37dac536c..279f4535629ea144234040ef55570133a4dbeba8 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -238,8 +238,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          if( this->modified() )
          {
             cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
-            if( ! TNL_CHECK_CUDA_DEVICE )
-               return false;
+            TNL_CHECK_CUDA_DEVICE;
             this->set_last_sync_state();
             return true;
          }
diff --git a/src/TNL/Solvers/Linear/CWYGMRES_impl.h b/src/TNL/Solvers/Linear/CWYGMRES_impl.h
index 1f7d06c7221e1bb7588e817bf7940d7e099fbbe3..4989f50207ef3a0c2a98545f7d66fc7e6a824db7 100644
--- a/src/TNL/Solvers/Linear/CWYGMRES_impl.h
+++ b/src/TNL/Solvers/Linear/CWYGMRES_impl.h
@@ -398,18 +398,14 @@ hauseholder_generate( DeviceVector& Y,
       // aux = Y_{i-1}^T * y_i
       RealType aux[ i ];
       Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
-      if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
+      Containers::Algorithms::Multireduction< DeviceType >::reduce
                ( scalarProduct,
                  i,
                  size,
                  Y.getData(),
                  ldSize,
                  y_i.getData(),
-                 aux ) )
-      {
-         std::cerr << "multireduction failed" << std::endl;
-         throw 1;
-      }
+                 aux );
 
       // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
       for( int k = 0; k < i; k++ ) {
@@ -442,12 +438,8 @@ hauseholder_apply_trunc( HostVector& out,
       // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access
       RealType* host_yi = &YL[ i * (restarting_max + 1) ];
       RealType host_z[ i + 1 ];
-      if( ! Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ) ||
-          ! Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ) )
-      {
-         std::cerr << "Failed to copy part of device vectors y_i or z to host buffer." << std::endl;
-         throw 1;
-      }
+      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 );
+      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 );
       for( int k = 0; k <= i; k++ )
          out[ k ] = host_z[ k ] - host_yi[ k ] * aux;
    }
@@ -501,18 +493,14 @@ hauseholder_cwy_transposed( DeviceVector& z,
    // aux = Y_i^T * w
    RealType aux[ i + 1 ];
    Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
-   if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
+   Containers::Algorithms::Multireduction< DeviceType >::reduce
             ( scalarProduct,
               i + 1,
               size,
               Y.getData(),
               ldSize,
               w.getData(),
-              aux ) )
-   {
-      std::cerr << "multireduction failed" << std::endl;
-      throw 1;
-   }
+              aux );
 
    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
diff --git a/src/TNL/Solvers/ODE/Euler_impl.h b/src/TNL/Solvers/ODE/Euler_impl.h
index debfeb7c204b77c97d694d456ad048ff700ce303..0b9eed1f8e9e7a356ae78d21469cde49bb88c6b1 100644
--- a/src/TNL/Solvers/ODE/Euler_impl.h
+++ b/src/TNL/Solvers/ODE/Euler_impl.h
@@ -204,7 +204,7 @@ void Euler< Problem > :: computeNewTimeLevel( DofVectorPointer& u,
                                                                       &_u[ gridOffset ],
                                                                       this->cudaBlockResidue.getData() );
          localResidue += this->cudaBlockResidue.sum();
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
       }
 #endif
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 53ccc9fd2d2f34aca2d53dbce957ee707756ed96..3323f4b742373738602d8a671fb2245cb10707d4 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -305,7 +305,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
       const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
 
       this->problem->getExplicitUpdate( time, tau, u, k1 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
       {
@@ -313,10 +313,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
          computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
       }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       this->problem->applyBoundaryConditions( time + tau_3, kAux );
       this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
       {
@@ -324,10 +324,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
          computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
       }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       this->problem->applyBoundaryConditions( time + tau_3, kAux );
       this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
       {
@@ -335,10 +335,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
          computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
       }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
       this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
       {
@@ -346,10 +346,10 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
          computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
       }
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
       this->problem->applyBoundaryConditions( time + tau, kAux );
       this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-      cudaThreadSynchronize();
+      cudaDeviceSynchronize();
 #endif
    }
 }
@@ -409,7 +409,7 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
                                                               &_k4[ gridOffset ],
                                                               &_k5[ gridOffset ],
                                                               &_kAux[ gridOffset ] );
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
          eps = std::max( eps, kAux->max() );
       }
 #endif
@@ -468,7 +468,7 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
                                                                        &_u[ gridOffset ],
                                                                        this->cudaBlockResidue.getData() );
          localResidue += this->cudaBlockResidue.sum();
-         cudaThreadSynchronize();
+         cudaDeviceSynchronize();
       }
       this->problem->applyBoundaryConditions( time, u );
 
diff --git a/src/TNL/StaticVectorFor.h b/src/TNL/StaticVectorFor.h
index 924a9c709601178e8850fb14102911d71eca5848..59af0fcb8256619d41014be4fa21023fee4679e2 100644
--- a/src/TNL/StaticVectorFor.h
+++ b/src/TNL/StaticVectorFor.h
@@ -16,34 +16,36 @@ namespace TNL {
 
 struct StaticVectorFor
 {
-    template < typename Index,
+   template< typename Index,
              typename Function,
              typename... FunctionArgs,
-             int dim>
-    static void exec( Containers::StaticVector<dim,Index> starts, Containers::StaticVector<dim,Index> ends, Function f, FunctionArgs... args )
-    {
-        Containers::StaticVector<dim,Index> index;
-        if(dim==1)
-        {
-            for(index[0]=starts[0]; index[0]< ends[0];index[0]++ )
-                 f( index, args... );
-        }
-
-        if(dim==2)
-        {
-            for(index[1]=starts[1]; index[1]< ends[1];index[1]++ )
-                for(index[0]=starts[0]; index[0]< ends[0];index[0]++ )
-                        f( index, args... );
-        }
-
-        if(dim==3)
-        {
-            for(index[2]=starts[2]; index[2]< ends[2];index[2]++ )
-                for(index[1]=starts[1]; index[1]< ends[1];index[1]++ )
-                    for(index[0]=starts[0]; index[0]< ends[0];index[0]++ )
-                        f( index, args... );
-        }
-    }
+             int dim >
+   static void exec( const Containers::StaticVector< dim, Index >& begin,
+                     const Containers::StaticVector< dim, Index >& end,
+                     Function f,
+                     FunctionArgs... args )
+   {
+      static_assert( 1 <= dim && dim <= 3, "unsupported dimension" );
+      Containers::StaticVector< dim, Index > index;
+
+      if( dim == 1 ) {
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+            f( index, args... );
+      }
+
+      if( dim == 2 ) {
+         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+               f( index, args... );
+      }
+
+      if( dim == 3 ) {
+         for( index[2] = begin[2]; index[2] < end[2]; index[2]++ )
+         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+            f( index, args... );
+      }
+   }
 };
 
 } // namespace TNL
diff --git a/src/TNL/legacy/CMakeLists.txt b/src/TNL/legacy/CMakeLists.txt
deleted file mode 100644
index e9f102901c3ae0cc5bef11560e2bd74b3381f947..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#ADD_SUBDIRECTORY( core )
-#ADD_SUBDIRECTORY( diff )
-#ADD_SUBDIRECTORY( mesh )
-#ADD_SUBDIRECTORY( solvers )
-
-SET( headers  )
-
-set( tnl_legacy_SOURCES 
-     ${tnl_legacy_mesh_SOURCES}
-     ${tnl_legacy_solvers_SOURCES} 
-     PARENT_SCOPE )
-
-INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/legacy )
\ No newline at end of file
diff --git a/src/TNL/legacy/benchmarks/ReorderCSR.cpp b/src/TNL/legacy/benchmarks/ReorderCSR.cpp
deleted file mode 100644
index 8418e50f8db66cab4533fd4813943c6f667815bd..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/ReorderCSR.cpp
+++ /dev/null
@@ -1,609 +0,0 @@
-// $Id: ReorderCSR.c,v 1.1 2010/11/04 15:35:14 asuzuki Exp asuzuki $ 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <getopt.h>
-
-#define EPS2 1.0e-20
-
-#define AMD_ORDERING
-#ifdef AMD_ORDERING
-#include "amd.h"
-#endif // #ifdef AMD_ORDERING
-
-typedef struct {
-  int col_ind;
-  double val;
-} csr_data;
-
-#define AMD      4
-#define DESCEND  1
-
-void init_CSR(double *val, int *col_ind, int *row_ptr, int nnz, int n);
-void print_CSR(char *st, 
-	       double *val, int *col_ind, int *row_ptr, int n);
-void make_order_index(int *ordering, int *row_ptr, int n, int descend);
-int comp_int(const void *_a, const void *_b);
-int comp_col_ind(const void *_a, const void *_b);
-
-void draw_csr(char *buf, int *row_ptr, int *csr_ind, int num_row);
-int count_padding(int *nonzeros, int *reordering, int num_row, int block_size);
-
-void makeRgCSR(double *val_new, int *col_ind_new, int *nonzeros, int *grp_ptr,
-	       double *val, int *col_ind, int *row_ptr, int group_size, int n);
-
-int countalignedRgCSR(int *row_ptr,  int group_size, int n);
-
-void SpMVCSR(double *y, double *x, double *val, 
-	     int *col_ind, int *row_ptr, int num_row);
-
-void  SpMVRgCSR(double *y, double *x, 
-		double *val, int *col_ind, int *nonzeros, int *grp_ptr, 
-		int block_size, int n);
-
-void reorder_csr_matrix(double *val_new, int *col_ind_new, int *row_ptr_new, 
-			double *val, int *col_ind, int *row_ptr, 
-			int *ordering, int *reordering, csr_data *work, 
-			int num_row);
-
-int main(int argc, char **argv)
-{
-  double *val, *val_coo, *val_new;
-  int *col_coo, *row_coo, *col_ind, *row_ptr, *nonzeros;
-  int *col_ind_new, *row_ptr_new;
-  int *ordering, *reordering;
-  double *val_rgcsr, *val_rgcsr_new;
-  double *x, *xx, *y, *y_rgcsr, *y_rgcsr_new;
-
-  int *col_ind_rgcsr, *nonzeros_rgcsr, *grp_ptr_rgcsr;
-  int *col_ind_rgcsr_new, *nonzeros_rgcsr_new, *grp_ptr_rgcsr_new;
-  int num_row, num_col, num_nz, num_nz0;
-  int jtmp;
-  FILE *fp;
-  char in_file[256], out_file[256], buf[256];
-  int block_size = 32;
-  int max_nonzeros, min_nonzeros, padding;
-  double mean_nonzeros;
-  int verbose = 0, graph_output = 0;
-  int method_ordering = DESCEND;
-  int flag_symmetric = 0;
-  int c;
-  // clear file name
-  in_file[0] = out_file[0] = 0;
-
-  while ((c = getopt(argc, argv, 
-		     "ADGvg:i:o:")) != EOF) {
-    switch(c) {
-    case 'G':
-      graph_output = 1;
-      break;
-    case 'i':
-      strcpy(in_file, optarg);
-      break;
-    case 'o':
-      strcpy(out_file, optarg);
-      break;
-    case 'A':
-      method_ordering = AMD;
-      break;
-    case 'D':
-      method_ordering = DESCEND;
-      break;
-    case 'g':
-      block_size = atoi(optarg);
-      break;
-    case 'v':
-      verbose = 1;
-      break;
-    case 'h':
-      fprintf(stderr, 
-	      "ReorderCSR -h -A -D -v -i [infile] -o [outfile] -g [group_size]\n");
-      break;
-    }
-  }
-
-  if (in_file[0] == 0 || out_file[0] == 0) {
-    fprintf(stderr, "matrix file name is incorrect\n");
-  }
-  if((fp = fopen(in_file, "r")) == NULL) {
-    exit(-1);
-  }
-
-  while (1) {
-    fgets(buf, 256, fp);
-    if (strstr(buf, "%%MatrixMarket") != NULL && 
-	strstr(buf, "symmetric") != NULL) {
-      flag_symmetric = 1;
-      if(verbose) {
-	printf("symmetric\n");
-      }
-    }
-    if (buf[0] != '%') {
-      break;
-    }
-  }
-  sscanf(buf, "%d %d %d", &num_row, &num_col, &num_nz);
-
-  col_coo = (int *)malloc(sizeof(int) * num_nz);
-  row_coo = (int *)malloc(sizeof(int) * num_nz);
-  val_coo = (double *)malloc(sizeof(double) * num_nz);
-
-  for (int j = 0; j < num_nz; j++) {
-    fgets(buf, 256, fp);
-    sscanf(buf, "%d %d %lf", &row_coo[j], &col_coo[j], &val_coo[j]);
-    // for C index array style starting at 0
-    row_coo[j]--;
-    col_coo[j]--;
-  }
-  fclose(fp);
-
-  // count diagonal parts
-  num_nz0 = num_nz;
-  if (flag_symmetric) {
-    num_nz = num_nz * 2;
-    int ktmp = 0;
-    for (int i = 0; i < num_nz0; i++) {
-      if (row_coo[i] == col_coo[i]) {
-	ktmp++;
-      }
-    }
-    num_nz -= ktmp;
-  }
-
-  col_ind = (int *)malloc(sizeof(int) * num_nz);
-  col_ind_new = (int *)malloc(sizeof(int) * num_nz);
-  val = (double *)malloc(sizeof(double) * num_nz);
-  val_new = (double *)malloc(sizeof(double) * num_nz);
-  row_ptr = (int *)malloc(sizeof(int) * (num_row + 1));
-  row_ptr_new = (int *)malloc(sizeof(int) * (num_row + 1));
-  nonzeros = (int *)malloc(sizeof(int) * num_row);
-  ordering = (int *)malloc(sizeof(int) * num_row);
-  reordering = (int *)malloc(sizeof(int) * num_row);
-
-  if(verbose) {
-    printf("%d %d %d\n", num_row, num_col, num_nz);
-  }
-
-  for (int i = 0; i < num_row; i++) {
-    nonzeros[i] = 0;
-  }
-  for (int j = 0; j < num_nz0; j++) {
-    nonzeros[row_coo[j]]++;
-    if (flag_symmetric) {
-      if (row_coo[j] != col_coo[j]) {
-	nonzeros[col_coo[j]]++;
-      }
-    }
-  }
-
-  row_ptr[0] = 0;
-  for (int i = 0; i < num_row; i++) {
-    row_ptr[i + 1] = row_ptr[i] + nonzeros[i];
-  }
-
-  for (int i = 0; i < num_row; i++) {
-    reordering[i] = i;
-  }
-
-  padding = count_padding(nonzeros, reordering, num_row, block_size);
-  if(verbose) {
-    printf("original:  %d\n", padding);
-  }
-  // make CSR format
-  for (int i = 0; i < num_row; i++) {
-    nonzeros[i] = 0;
-  }
-  for (int j = 0; j < num_nz0; j++) { 
-    int ii = row_coo[j];
-    int jj = col_coo[j];
-    int ktmp = row_ptr[ii] + nonzeros[ii];
-    col_ind[ktmp] = jj;
-    val[ktmp] = val_coo[j];
-    nonzeros[ii]++;
-    if (flag_symmetric) {
-      if (ii != jj) {
-	ktmp = row_ptr[jj] + nonzeros[jj];
-	col_ind[ktmp] = ii;
-	val[ktmp] = val_coo[j];
-	nonzeros[jj]++;
-      }
-    }
-  }
-
-  max_nonzeros = 0;
-  for (int i = 0; i < num_row; i++) {
-    if (max_nonzeros < nonzeros[i]) {
-      max_nonzeros = nonzeros[i];
-    }
-  }
-  csr_data *work;
-  work = (csr_data *)malloc(max_nonzeros * sizeof(csr_data));
-
-  // sort column index in each row
-  for (int i = 0; i < num_row; i++) {
-    int ktmp = 0;
-    for (int k = row_ptr[i]; k < row_ptr[i + 1]; k++) {
-      work[ktmp].col_ind = col_ind[k];
-      work[ktmp].val     = val[k];
-      ktmp++;
-    }
-    qsort(work, nonzeros[i], sizeof(csr_data), comp_col_ind);
-    ktmp = 0;
-    for (int k = row_ptr[i]; k < row_ptr[i + 1]; k++) {
-      col_ind[k] = work[ktmp].col_ind;
-      val[k] = work[ktmp].val;
-      ktmp++;
-    }
-  }
-  strcpy(buf, in_file);
-  strcat(buf, ".ps");
-
-  if (graph_output) {
-    draw_csr(buf, row_ptr, col_ind, num_row);
-  }
-
-  strcpy(buf, out_file);
-
-  switch(method_ordering) {
-#ifdef AMD_ORDERING
-  case AMD: 
-    {
-      double Control [AMD_CONTROL], Info [AMD_INFO];
-      
-      amd_defaults(Control) ;
-      amd_control(Control) ;
-      (void)amd_order(num_row, row_ptr, col_ind, reordering, Control, Info);
-      // make inverse mapping : old -> new
-      if (verbose) {
-	amd_info(Info);
-      }
-      for (int i = 0; i < num_row; i++) {
-	ordering[reordering[i]] = i;
-      }
-      strcat(buf, ".amd.ps");
-    }
-    break;
-#endif
-  case DESCEND:
-    make_order_index(ordering, row_ptr, num_row, 1);
-    for (int i = 0; i < num_row; i++) {
-      reordering[ordering[i]] = i;
-    }
-    strcat(buf, ".descend.ps");
-    break;
-  }
-
-  // ordering[i] : old -> new, new index with dreasing order of nonzro
-
-  padding = count_padding(nonzeros, reordering, num_row, block_size);
-
-  if(verbose) {
-    switch(method_ordering) {
-    case AMD:
-      printf("amd:       ");
-      break;
-    case DESCEND:
-      printf("descending:");
-      break;
-    }
-    printf("%d\n", padding);
-  }
-  reorder_csr_matrix(val_new, col_ind_new, row_ptr_new, 
-		     val, col_ind, row_ptr, 
-		     ordering, reordering, work, num_row);
-  
-  
-  if((fp = fopen(out_file, "w")) == NULL) {
-    exit(-1);
-  }
-  fprintf(fp, "%%%%MatrixMarket matrix coordinate real general\n");
-  fprintf(fp, "%d %d %d\n", num_row, num_row, row_ptr_new[num_row]);
-  for (int i = 0; i < num_row; i++) {
-    for (int j = row_ptr_new[i]; j < row_ptr_new[i + 1]; j++) {
-      fprintf(fp, "%d %d %g\n", (i + 1), (col_ind_new[j] + 1), val_new[j]);
-    }
-  }
-  fclose(fp);
-
-  if (graph_output) {
-    draw_csr(buf, row_ptr_new, col_ind_new, num_row);
-  }
-
-  min_nonzeros = num_row;
-  mean_nonzeros = 0.0;
-  for (int i = 0; i < num_row; i++) {
-    mean_nonzeros += (double)nonzeros[i];
-    if (min_nonzeros > nonzeros[i]) {
-      min_nonzeros = nonzeros[i];
-    }
-  }
-  mean_nonzeros /= (double)num_row;
-  if (verbose) {
-    printf("max nonzeros = %d mean= %g min = %d\n", 
-	   max_nonzeros, mean_nonzeros, min_nonzeros);
-  }
-}
-
-void print_CSR(char *st, 
-	       double *val, int *col_ind, int *row_ptr, int n)
-{
-  printf("[ %s ]\n", st);
-  for (int i = 0; i < n; i++) {
-    printf("%d : [%d] ", i,  row_ptr[i + 1] -  row_ptr[i]);
-    for (int k = row_ptr[i]; k < row_ptr[i + 1]; k++) {
-      printf(" %g:%d ", val[k], col_ind[k]);
-    }
-    printf("\n");
-  }
-}
-
-
-void make_order_index(int *ordering, int *row_ptr, int n, int descend)
-{
-  int *slices, *slice_offset;
-  int mn;
-  
-  // find maximum nonzeros from all rows
-  mn = 0;
-  for (int i = 0; i < n; i++) {
-    int non_zeros = row_ptr[i + 1] - row_ptr[i];
-    if (mn < non_zeros) {
-      mn = non_zeros;
-    }
-  }
-  // prepare working array : this suppose row without element
-  slices = (int *)malloc(sizeof(int) * (mn + 1));
-  slice_offset = (int *)malloc(sizeof(int) * (mn + 1));
-  for (int i = 0; i <= mn; i++) {
-    slices[i] = 0;
-    slice_offset[i] = 0;
-  }
-  // slices[i] keeps number of indices of rows whos width is i
-  for (int i = 0; i < n; i++) {
-    int non_zeros = row_ptr[i + 1] - row_ptr[i];
-    slices[non_zeros]++;
-  }
-  // making blocks in decreasing order of nonzeros
-  if (descend) {
-    slice_offset[mn] = 0;
-    for (int i = mn - 1; i >= 0; i--) {
-      slice_offset[i] = slice_offset[i + 1] + slices[i + 1];
-    }
-  }
-  else {
-    slice_offset[0] = 0;
-    for (int i = 0; i < mn; i++) {
-      slice_offset[i + 1] = slice_offset[i] + slices[i];
-    }
-  }
-  
-  // this keeps original ordeing wihtin a block
-  for (int i = 0; i < n; i++) {
-    int non_zeros = row_ptr[i + 1] - row_ptr[i];
-    ordering[i] = slice_offset[non_zeros]++;
-  }
-
-  free(slices);
-  free(slice_offset);
-  
-}
- 
-int  comp_int(const void *_a, const void *_b) {
-  // cast to deal with arguments defined as void *
-  int a = *(int *)_a;
-  int b = *(int *)_b;
-
-  if (a < b) {
-    return -1;
-  } else if (a > b) {
-    return 1;
-  }
-  else {
-    return 0;
-  }
-}
-
-
-int  comp_col_ind(const void *_a, const void *_b) {
-  // cast to deal with arguments defined as void *
-  int a = (*(csr_data *)_a).col_ind;
-  int b = (*(csr_data *)_b).col_ind;
-
-  if (a < b) {
-    return -1;
-  } else if (a > b) {
-    return 1;
-  }
-  else {
-    return 0;
-  }
-}
-
-
-void draw_csr(char *buf, int *row_ptr, int *col_ind, int num_row)
-{
-  FILE *fp;
-
-  if((fp = fopen(buf, "w")) == NULL) {
-    exit(-1);
-  }
-  fprintf(fp, "%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: 5 5 395 395\n");
-  fprintf(fp, "/rr { %g } def\n",  0.45 * 380.0 / (double)(num_row + 2));
-  fprintf(fp, "/n { newpath } def\n");
-  fprintf(fp, "/rl { rlineto } def\n");
-  fprintf(fp, "/m { moveto } def\n");
-  fprintf(fp,"n 10 10 m 380 0 rl 0 380 rl -380 0 rl 0 -380 rl closepath 0.85 setgray fill\n");
-  for (int i = 0; i < num_row; i++) {
-    for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
-      fprintf(fp,"n %g %g rr 0 360 arc 0 setgray fill\n", 
-	      10.0  + (double)col_ind[j] / (double)(num_row + 2) * 380.0,
-	      390.0 - (double)i / (double)(num_row + 2) * 380.0);
-    }
-  }
-  fprintf(fp, "showpage\n");
-  fclose(fp);
-}
-
-
-int count_padding(int *nonzeros, int *reordering, int num_row, int block_size)
-{
-  // count artificial zeros
-  int padding = 0;
-
-  for (int k = 0; k < num_row; k += block_size) {
-    int block_max = 0;
-    for (int j = 0; j < block_size; j++) {
-      if (k + j >= num_row) {
-	break;
-      }
-      int kj = reordering[k + j];
-      if (block_max < nonzeros[kj]) {
-	block_max = nonzeros[kj];
-      }
-    }
-    for (int j = 0; j < block_size; j++) {
-      if (k + j >= num_row) {
-	break;
-      }
-      int kj = reordering[k + j];
-      padding += block_max - nonzeros[kj];
-    }
-  }
-
-  return padding;
-}
-
-int countalignedRgCSR(int *row_ptr,  int group_size, int n)
-{
-  int aligned_max;
-
-  aligned_max = 0;
-  // find maximumn number of nonzeros in each group
-  for (int i = 0; i < n; i += group_size) {
-    int ntmp = 0;
-    for (int k = 0; k < group_size; k++) {
-      int ik = i + k;
-      if (ik >= n) {
-	break;
-      }
-      int mtmp = row_ptr[ik + 1] -  row_ptr[ik];
-      if (ntmp < mtmp) {
-	ntmp = mtmp;
-      }
-    }
-    aligned_max += ntmp * group_size;
-  }
-  return aligned_max;
-}
-
-void makeRgCSR(double *val_new, int *col_ind_new, int *nonzeros, int *grp_ptr,
-	       double *val, int *col_ind, int *row_ptr, int group_size, int n)
-{
-  int jtmp;
-
-  jtmp = 0;
-  grp_ptr[0] = 0;
-  for (int i = 0; i < n; i+= group_size) {
-   int current_group = group_size;
-   if (i + group_size > n) {
-     current_group = n % group_size;
-   }
-   int ntmp = 0;
-   for (int k = 0; k < current_group; k++) {
-     int ik = i + k;
-     int mtmp = row_ptr[ik + 1] -  row_ptr[ik];
-      if (ntmp < mtmp) {
-	ntmp = mtmp;
-      }
-   }
-   int ig = i / group_size;
-   if (ig < (n / group_size + (n % group_size != 0) - 1)) {
-    grp_ptr[ig + 1] = grp_ptr[ig] + ntmp * group_size;  
-   }
-   for (int j = 0; j < ntmp; j++) {
-     for (int k = 0; k < current_group; k++) {
-       int ik = i + k;
-       if (j < (row_ptr[ik + 1] - row_ptr[ik])) {
-	 col_ind_new[jtmp] = col_ind[row_ptr[ik] + j];
-	 val_new[jtmp] = val[row_ptr[ik] + j];
-       }
-       else {
-	 col_ind_new[jtmp] = (-1);
-	 val_new[jtmp] = 0.0;
-       }
-       jtmp++;
-     } // loop : k
-   }  // loop : j
-  }
-
-  for (int i = 0; i < n; i++) {
-    nonzeros[i] = row_ptr[i + 1] -  row_ptr[i];
-  }
-}
-
-void SpMVCSR(double *y, double *x, double *val, 
-	     int *col_ind, int *row_ptr, int num_row)
-{
-  double stmp;
-  for (int i = 0; i < num_row; i++) {
-    stmp = 0.0;
-    for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
-      stmp += x[col_ind[j]] * val[j];
-    }
-    y[i] = stmp;
-  }
-}
- 
-void  SpMVRgCSR(double *y, double *x, 
-		double *val, int *col_ind, int *nonzeros, int *grp_ptr, 
-		int block_size, int n)
-{
-  int num_blocks = n / block_size + (n % block_size != 0);
-  for (int j = 0; j < num_blocks; j++) {
-    for (int k = 0; k < block_size; k++) {
-      int irow = j * block_size + k;
-      if (irow >= n) {
-	return;
-      }
-      int ptr = grp_ptr[j] + k;
-      int crnt_grp_size = block_size;
-      if ((j + 1) * block_size > n) {
-	crnt_grp_size = n % block_size;
-      }
-      double stmp = 0.0;
-      for (int i = 0; i < nonzeros[irow]; i++) {
-	stmp += val[ptr] * x[col_ind[ptr]];
-	ptr += crnt_grp_size;
-      }
-      y[irow] = stmp;
-    }
-  }
-}
-
-void reorder_csr_matrix(double *val_new, int *col_ind_new, int *row_ptr_new, 
-			double *val, int *col_ind, int *row_ptr, 
-			int *ordering, int *reordering, csr_data *work, 
-			int num_row)
-{
-  // csr_data *work is allocated as max_j (row_ptr[j + 1] - row_ptr[j]) sized
-  int jtmp = 0;
-  row_ptr_new[0] = 0;
-  for (int i = 0; i < num_row; i++) {
-    int j = reordering[i];
-    int ktmp = 0;
-    for (int k = row_ptr[j]; k < row_ptr[j + 1]; k++) {
-      work[ktmp].col_ind = ordering[col_ind[k]];
-      work[ktmp].val     = val[k];
-      ktmp++;
-    }
-    int itmp = row_ptr[j + 1] - row_ptr[j];
-    qsort(work, itmp, sizeof(csr_data), comp_col_ind);
-    ktmp = 0;
-    for (int k = 0; k < itmp; k++) {
-      val_new[jtmp]     = work[k].val;
-      col_ind_new[jtmp] = work[k].col_ind;
-      jtmp++; 
-    }
-    row_ptr_new[i + 1] = jtmp;
-  }
-}
diff --git a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cpp b/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cpp
deleted file mode 100644
index 6d543d7c93bf817ce83d267d735b6fea732ee231..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
-                          matrix-solvers-benchmark.cpp  -  description
-                             -------------------
-    begin                : Jan 8, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include "matrix-solvers-benchmark.h"
diff --git a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cu b/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cu
deleted file mode 100644
index 5afd986ff0c0224bc51e40b3da4ff18da64ee0eb..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/***************************************************************************
-                          matrix-solvers-benchmark.cu  -  description
-                             -------------------
-    begin                : Oct 20, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include "matrix-solvers-benchmark.h"
diff --git a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h b/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h
deleted file mode 100644
index 1bb3dfd96c4d807af244c5f3424990b9278f1824..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h
+++ /dev/null
@@ -1,405 +0,0 @@
-/***************************************************************************
-                          matrix-solvers-benchmark.h  -  description
-                             -------------------
-    begin                : Jan 8, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef MATRIXSOLVERSBENCHMARK_H_
-#define MATRIXSOLVERSBENCHMARK_H_
-
-#include <fstream>
-#include <TNL/File.h>
-#include <TNL/Object.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Exceptions/CudaSupportMissing.h>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/legacy/matrices/tnlRgCSR.h>
-#include <TNL/Solvers/IterativeSolverMonitor.h>
-#include <TNL/Solvers/Linear/stationary/SOR.h>
-#include <TNL/Solvers/Linear/CG.h>
-#include <TNL/Solvers/Linear/BICGStab.h>
-#include <TNL/Solvers/Linear/GMRES.h>
-#include <TNL/Solvers/Linear/TFQMR.h>
-#ifdef HAVE_PETSC
-   #include <petsc.h>
-#endif
-
-#include "tnlConfig.h"
-const char configFile[] = TNL_CONFIG_DIRECTORY "tnl-matrix-solvers-benchmark.cfg.desc";
-
-void writeTestFailToLog( const Config::ParameterContainer& parameters )
-{
-   const String& logFileName = parameters. getParameter< String >( "log-file" );
-   std::fstream logFile;
-   if( logFileName != "" )
-   {
-      logFile. open( logFileName. getString(), std::ios::out | std::ios::app );
-      if( ! logFile )
-         std::cerr << "Unable to open the log file " << logFileName << std::endl;
-      else
-      {
-         String bgColor( "#FF0000" );
-         logFile << "             <td bgcolor=" << bgColor << "> N/A </td> " << std::endl
-                 << "             <td bgcolor=" << bgColor << "> N/A </td> " << std::endl
-                 << "             <td bgcolor=" << bgColor << "> N/A </td> " << std::endl;
-         logFile. close();
-      }
-   }
-}
-
-template< typename Solver, typename Matrix, typename Vector >
-bool benchmarkSolver( const Config::ParameterContainer&  parameters,
-                      Solver& solver,
-                      const Matrix& matrix,
-                      const Vector& b,
-                      Vector& x )
-{
-   typedef typename Matrix :: RealType RealType;
-   typedef typename Matrix :: DeviceType DeviceType;
-   typedef typename Matrix :: IndexType IndexType;
-
-   const RealType& maxResidue = parameters. getParameter< double >( "max-residue" );
-   const IndexType& size = matrix. getRows();
-   const IndexType nonZeros = matrix. getNumberOfMatrixElements();
-   //const IndexType maxIterations = size * ( ( double ) size * size / ( double ) nonZeros );
-   const IndexType maxIterations = size;
-  std::cout << "Setting max. number of iterations to " << maxIterations << std::endl;
-
-   solver. setMatrix( matrix );
-   solver. setMaxIterations( maxIterations );
-   solver. setMaxResidue( maxResidue );
-   solver. setMinResidue( 1.0e9 );
-   IterativeSolverMonitor< RealType, IndexType > solverMonitor;
-   solver. setSolverMonitor( solverMonitor );
-   solver. setRefreshRate( 10 );
-   solverMonitor. resetTimers();
-   solver. solve( b, x );
-
-   bool solverConverged( solver. getResidue() < maxResidue );
-   const String& logFileName = parameters. getParameter< String >( "log-file" );
-   std::fstream logFile;
-   if( logFileName != "" )
-   {
-      logFile. open( logFileName. getString(), std::ios::out | std::ios::app );
-      if( ! logFile )
-         std::cerr << "Unable to open the log file " << logFileName << std::endl;
-      else
-      {
-         String bgColor( "#FF0000" );
-         if( solver. getResidue() < 1 )
-            bgColor="#FF8888";
-         if( solver. getResidue() < maxResidue )
-         {
-            bgColor="#88FF88";
-         }
-         double cpuTime = solverMonitor. getCPUTime();
-         double realTime = solverMonitor. getRealTime();
-         logFile << "             <td bgcolor=" << bgColor << "> " << solver. getResidue() << " </td> " << std::endl
-                 << "             <td bgcolor=" << bgColor << "> " << solver. getIterations() << " </td> " << std::endl
-                 << "             <td bgcolor=" << bgColor << "> " << cpuTime << " </td> " << std::endl;
-         logFile. close();
-      }
-   }
-   return solverConverged;
-
-}
-
-template< typename Matrix, typename Vector >
-bool benchmarkMatrixOnDevice( const Config::ParameterContainer&  parameters,
-                              const Matrix& matrix,
-                              const Vector& b,
-                              Vector& x )
-{
-   typedef typename Matrix :: RealType RealType;
-   typedef typename Matrix :: DeviceType DeviceType;
-   typedef typename Matrix :: IndexType IndexType;
-
-   const String& solverClass = parameters. getParameter< String >( "solver-class" );
-   if( solverClass == "tnl" )
-   {
-      const String& solverName = parameters. getParameter< String >( "solver-name" );
-      IndexType iterations( 0 );
-      RealType residue( 0.0 );
-      bool converged( false );
-      if( solverName == "sor" )
-      {
-         SOR< Matrix > solver;
-         const RealType& sorOmega = parameters. getParameter< double >( "sor-omega" );
-         solver. setOmega( sorOmega );
-         return benchmarkSolver( parameters, solver, matrix, b, x );
-      }
-      if( solverName == "cg" )
-      {
-         CG< Matrix > solver;
-         return benchmarkSolver( parameters, solver, matrix, b, x );
-      }
-      if( solverName == "bicgstab" )
-      {
-         BICGStab< Matrix > solver;
-         return benchmarkSolver( parameters, solver, matrix, b, x );
-      }
-      if( solverName == "gmres" )
-      {
-         GMRES< Matrix > solver;
-         const IndexType& gmresRestarting = parameters. getParameter< int >( "gmres-restarting" );
-         solver. setRestarting( gmresRestarting );
-         return benchmarkSolver( parameters, solver, matrix, b, x );
-      }
-      if( solverName == "tfqmr" )
-      {
-         TFQMR< Matrix > solver;
-         return benchmarkSolver( parameters, solver, matrix, b, x );
-      }
-      std::cerr << "Unknown solver " << solverName << std::endl;
-      return false;
-   }
-   if( solverClass == "petsc" )
-   {
-#ifndef HAVE_PETSC
-      std::cerr << "PETSC is not installed on this system." << std::endl;
-      writeTestFailToLog( parameters );
-      return false;
-#else
-      if( DeviceType :: getDeviceType() != "Devices::Host" )
-      {
-         std::cerr << "PETSC tests can run only on host. The current device is " << DeviceType :: getDeviceType() << std::endl;
-         writeTestFailToLog( parameters );
-         return false;
-      }
-      /****
-       * Set-up the PETSC matrix
-       */
-      const IndexType n = matrix. getSize();
-      Mat A;
-      MatCreate( PETSC_COMM_WORLD, &A );
-      MatSetType( A, MATAIJ );
-      MatSetSizes( A, PETSC_DECIDE, PETSC_DECIDE, n, n );
-      MatSetUp( A );
-
-      /****
-       * Inserting data
-       */
-      Array< PetscScalar > petscVals;
-      Array< PetscInt > petscCols;
-      petscVals. setSize( n );
-      petscCols. setSize( n );
-      for( IndexType i = 0; i < n; i ++ )
-      {
-         const IndexType rowLength = matrix. getRowLength( i );
-         for( IndexType j = 0; j < rowLength; j ++ )
-         {
-            petscVals. setElement( j, matrix. getRowValues( i )[ j ] );
-            petscCols. setElement( j, matrix. getRowColumnIndexes( i )[ j ] );
-         }
-         MatSetValues( A,
-                       1,  // setting one row
-                       &i, // index of thew row
-                       rowLength,
-                       petscCols. getData(),
-                       petscVals. getData(),
-                       INSERT_VALUES );
-      }
-      MatAssemblyBegin( A, MAT_FINAL_ASSEMBLY );
-      MatAssemblyEnd( A, MAT_FINAL_ASSEMBLY );
-
-      /****
-       * Check matrix conversion
-       */
-      /*for( IndexType i = 0; i < n; i++ )
-         for( IndexType j = 0; j < n; j ++ )
-         {
-            PetscScalar value;
-            MatGetValues( A, 1, &i, 1, &j, &value );
-            if( matrix. getElement( i, j ) != value )
-            {
-               std::cerr << "Conversion to PETSC matrix was not correct at position " << i << " " << j << "." << std::endl;
-               std::cerr << "Values are " << value << " and " << matrix. getElement( i, j ) << std::endl;
-               return false;
-            }
-         }
-      std::cerr << "PETSC CONVERSION WAS OK!!!" << std::endl;
-      return true;*/
-
-      Vec petscB, petscX;
-      KSP ksp;
-      KSPCreate( PETSC_COMM_WORLD, &ksp );
-
-
-#endif
-   }
-
-}
-
-
-template< typename Real, typename Index >
-bool benchmarkMatrix( const Config::ParameterContainer&  parameters )
-{
-   /****
-    * Loading the matrix from the input file
-    */
-   typedef CSR< Real, Devices::Host, Index > csrMatrixType;
-   String inputFile = parameters. getParameter< String >( "input-file" );
-   csrMatrixType csrMatrix;
-   if( ! csrMatrix. load( inputFile ) )
-   {
-      std::cerr << "Unable to load file " << inputFile << std::endl;
-      return false;
-   }
-
-   /****
-    * Writing matrix statistics
-    */
-   String matrixStatsFileName = parameters. getParameter< String >( "matrix-stats-file" );
-   if( matrixStatsFileName )
-   {
-      std::fstream matrixStatsFile;
-      matrixStatsFile. open( matrixStatsFileName. getString(), std::ios::out );
-      if( ! matrixStatsFile )
-      {
-         std::cerr << "Unable to open matrix statistics file " << matrixStatsFileName << std::endl;
-         return false;
-      }
-      matrixStatsFile << "             <td> " << csrMatrix. getRows() << " </td> " << std::endl
-                      << "             <td> " << csrMatrix. getNumberOfMatrixElements() << " </td> " << std::endl;
-      matrixStatsFile. close();
-   }
-
-   /****
-    * Setting up the linear problem
-    */
-   const Index size = csrMatrix. getRows();
-  std::cout << "Matrix size is " << size << std::endl;
-   Vector< Real, Devices::Host, Index > x1( "matrix-solvers-benchmark:x1" );
-   Vector< Real, Devices::Host, Index > x( "matrix-solvers-benchmark:x" );
-   Vector< Real, Devices::Host, Index > b( "matrix-solvers-benchmark:b" );
-   if( ! x1. setSize( size ) ||
-       ! x. setSize( size ) ||
-       ! b. setSize( size ) )
-   {
-      std::cerr << "Sorry, I do not have enough memory for the benchmark." << std::endl;
-      return false;
-   }
-   x1. setValue( ( Real ) 1.0 );
-   x. setValue( ( Real ) 0.0 );
-   csrMatrix. vectorProduct( x1, b );
-
-   const String device = parameters. getParameter< String >( "device" );
-   if( device == "host" )
-      if( ! benchmarkMatrixOnDevice( parameters, csrMatrix, b, x ) )
-         return false;
-
-   if( device == "cuda" )
-   {
-#ifdef HAVE_CUDA
-      tnlRgCSR< Real, Devices::Cuda, Index > rgCSR( "matrix-solvers-benchmark:rgCSR" );
-      // FIX THIS
-      //rgCSR = csrMatrix;
-      /*Vector< Real, Devices::Cuda, Index > cudaX( "matrix-solvers-benchmark:cudaX" );
-      Vector< Real, Devices::Cuda, Index > cudaB( "matrix-solvers-benchmark:cudaB" );
-      cudaX. setLike( x );
-      cudaX = x;
-      cudaB. setLike( b );
-      cudaB = b;
-      if( ! benchmarkMatrixOnDevice( parameters, rgCSR, cudaB, cudaX ) )
-         return false;
-      x = cudaX;*/
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
-   }
-
-  std::cout << std::endl << "L1 diff. norm = " << x. differenceLpNorm( x1, ( Real ) 1.0 )
-        << " L2 diff. norm = " << x. differenceLpNorm( x1, ( Real ) 2.0 )
-        << " Max. diff. norm = " << x. differenceMax( x1 ) << std::endl;
-   return true;
-}
-
-int main( int argc, char* argv[] )
-{
-#ifdef HAVE_PETSC
-   PetscInitialize( &argc, &argv, ( char* ) 0, ( char* ) 0 );
-#endif
-   /****
-    * Parsing command line arguments ...
-    */
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-
-   if( conf_desc.parseConfigDescription( configFile ) != 0 )
-      return 1;
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   String inputFile = parameters. getParameter< String >( "input-file" );
-   String str_input_mtx_file = parameters. getParameter< String >( "input-mtx-file" );
-   String log_file_name = parameters. getParameter< String >( "log-file" );
-   double stop_time = parameters. getParameter< double >( "stop-time" );
-   int verbose = parameters. getParameter< int >( "verbose");
-
-   /****
-    * Checking a type of the input data
-    */
-   String objectType;
-   if( ! getObjectType( inputFile, objectType ) )
-   {
-      std::cerr << "Unable to detect object type in " << inputFile << std::endl;
-      return EXIT_FAILURE;
-   }
-   Containers::List< String > parsedObjectType;
-   parseObjectType( objectType,
-                    parsedObjectType );
-   String objectClass = parsedObjectType[ 0 ];
-   if( objectClass != "CSR" )
-   {
-      std::cerr << "I am sorry, I am expecting CSR in the input file but I found " << objectClass << "." << std::endl;
-      return EXIT_FAILURE;
-   }
-
-   String precision = parsedObjectType[ 1 ];
-   //String indexing = parsedObjectType[ 3 ];
-   if( precision == "float" )
-      if( ! benchmarkMatrix< float, int >( parameters ) )
-      {
-#ifdef HAVE_PETSC
-         PetscFinalize();
-#endif
-         return EXIT_FAILURE;
-      }
-
-   if( precision == "double" )
-      if( ! benchmarkMatrix< double, int >( parameters ) )
-      {
-#ifdef HAVE_PETSC
-         PetscFinalize();
-#endif
-         return EXIT_FAILURE;
-      }
-
-   std::fstream log_file;
-   if( log_file_name )
-   {
-      log_file. open( log_file_name. getString(), std::ios::out | std::ios::app );
-      if( ! log_file )
-      {
-         std::cerr << "Unable to open log file " << log_file_name << " for appending logs." << std::endl;
-         return EXIT_FAILURE;
-      }
-     std::cout << "Writing to log file " << log_file_name << "..." << std::endl;
-   }
-#ifdef HAVE_PETSC
-   PetscFinalize();
-#endif
-   return EXIT_SUCCESS;
-
-}
-
-
-#endif /* MATRIXSOLVERSBENCHMARK_H_ */
diff --git a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cpp b/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cpp
deleted file mode 100644
index 0811d90de8825194b92c492ff0f19c61d75a03d6..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/***************************************************************************
-                          sparse-matrix-benchmark.cpp  -  description
-                             -------------------
-    begin                : Jul 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "sparse-matrix-benchmark.h"
diff --git a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cu b/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cu
deleted file mode 100644
index 524aca53bd8ff5d7db254032a90e9dde3dc1edee..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.cu
+++ /dev/null
@@ -1,12 +0,0 @@
-/***************************************************************************
-                          sparse-matrix-benchmark.cu  -  description
-                             -------------------
-    begin                : Jul 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-
-#include "sparse-matrix-benchmark.h"
diff --git a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.h b/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.h
deleted file mode 100644
index 10bc59794c898aa987b35b8b640eec778d21594d..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/sparse-matrix-benchmark.h
+++ /dev/null
@@ -1,416 +0,0 @@
-/***************************************************************************
-                          sparse-matrix-benchmark.h  -  description
-                             -------------------
-    begin                : Jul 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef SPARSEMATRIXBENCHMARK_H_
-#define SPARSEMATRIXBENCHMARK_H_
-
-#include <fstream>
-#include <iomanip>
-#include <TNL/Config/ConfigDescription.h>
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Matrices/Dense.h>
-#include <TNL/Matrices/Ellpack.h>
-#include <TNL/Matrices/SlicedEllpack.h>
-#include <TNL/Matrices/ChunkedEllpack.h>
-#include <TNL/Matrices/CSR.h>
-#include <TNL/Matrices/MatrixReader.h>
-#include <TNL/Math.h>
-#include "tnlSpmvBenchmark.h"
-
-#include "tnlConfig.h"
-const char configFile[] = TNL_CONFIG_DIRECTORY "tnl-sparse-matrix-benchmark.cfg.desc";
-
-
-/*
-double bestCudaRgCSRGflops( 0 );
-
-template< typename Real >
-void benchmarkRgCSRFormat( const CSR< Real, Devices::Host, int >& csrMatrix,
-                           const Vector< Real, Devices::Host >& refX,
-                           const Vector< Real, Devices::Cuda >& cudaX,
-                           Vector< Real, Devices::Host >& refB,
-                           bool formatTest,
-                           const int maxIterations,
-                           const bool useAdaptiveGroupSize,
-                           const tnlAdaptiveGroupSizeStrategy adaptiveGroupSizeStrategy,
-                           const tnlSpmvBenchmarkCSR< Real, int >& csrMatrixBenchmark,
-                           bool verbose,
-                           const String& inputMtxFile,
-                           const String& logFileName,
-                           std::fstream& logFile )
-{
-   tnlSpmvBenchmarkRgCSR< Real, Devices::Host, int > hostRgCsrMatrixBenchmark;
-   for( int groupSize = 16; groupSize <= 64; groupSize *= 2 )
-   {
-
-      hostRgCsrMatrixBenchmark. setGroupSize( groupSize );
-      hostRgCsrMatrixBenchmark. setUseAdaptiveGroupSize( useAdaptiveGroupSize );
-      hostRgCsrMatrixBenchmark. setAdaptiveGroupSizeStrategy( adaptiveGroupSizeStrategy );
-      hostRgCsrMatrixBenchmark. setup( csrMatrix );
-      if( formatTest )
-         hostRgCsrMatrixBenchmark. testMatrix( csrMatrix, verbose );
-      hostRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-      //hostRgCsrMatrixBenchmark. runBenchmark( refX, refB, verbose );
-      hostRgCsrMatrixBenchmark. tearDown();
-
-      if( logFileName )
-         hostRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                    csrMatrixBenchmark. getGflops(),
-                                                    inputMtxFile,
-                                                    csrMatrix,
-                                                    true );
-
-      tnlSpmvBenchmarkRgCSR< Real, Devices::Cuda, int > cudaRgCsrMatrixBenchmark;
-      cudaRgCsrMatrixBenchmark. setGroupSize( groupSize );
-      cudaRgCsrMatrixBenchmark. setup( csrMatrix );
-      cudaRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-      for( int cudaBlockSize = 32; cudaBlockSize <= 256; cudaBlockSize *= 2 )
-      {
-         cudaRgCsrMatrixBenchmark. setCudaBlockSize( cudaBlockSize );
-         if( formatTest )
-            cudaRgCsrMatrixBenchmark. testMatrix( csrMatrix, verbose );
-         cudaRgCsrMatrixBenchmark. runBenchmark( cudaX, refB, verbose );
-         if( logFileName )
-            cudaRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                       csrMatrixBenchmark. getGflops(),
-                                                       inputMtxFile,
-                                                       csrMatrix,
-                                                       false );
-         bestCudaRgCSRGflops = max( bestCudaRgCSRGflops, cudaRgCsrMatrixBenchmark. getGflops() );
-      }
-      cudaRgCsrMatrixBenchmark. tearDown();
-   }
-}
-*/
-
-template< typename RealType >
-bool benchmarkMatrix( const Config::ParameterContainer& parameters )
-{
-   /****
-    * Read the CSR matrix ...
-    */
-   typedef CSR< RealType, Devices::Host, int > CsrMatrix;
-   CsrMatrix csrMatrix;
-
-   const String& inputFileName = parameters.getParameter< String >( "input-file" );
-   const String& inputMtxFileName = parameters.getParameter< String >( "input-mtx-file" );
-   const String& logFileName = parameters.getParameter< String >( "log-file" );
-   const String& pdfFileName = parameters.getParameter< String >( "pdf-file" );
-   bool verbose = parameters.getParameter< bool >( "verbose" );
-   const int maxIterations = parameters.getParameter< int >( "max-iterations" );
-
-   std::fstream inputFile;
-   inputFile.open( inputMtxFileName.getString(), std::ios::in );
-   if( ! inputFile )
-   {
-      std::cerr << "I am not able to open the file " << inputMtxFileName << "." << std::endl;
-      return false;
-   }
-   if( ! MatrixReader< CsrMatrix >::readMtxFile( inputFile, csrMatrix ) )
-      return false;
-
-   /****
-    * Check the number of the non-zero elements
-    */
-   const long int nonzeroElements = csrMatrix. getNumberOfNonzeroMatrixElements();
-   if( verbose )
-     std::cout << "Matrix rows: " << csrMatrix.getRows()
-           << " Matrix columns: " << csrMatrix.getColumns()
-           << " Non-zero elements: " << nonzeroElements << std::endl;
-
-   const long int rows = csrMatrix.getRows();
-   const long int columns = csrMatrix.getColumns();
-   Vector< RealType, Devices::Host > refX( "ref-x", columns ), refB( "ref-b", rows );
-   Vector< RealType, Devices::Cuda > cudaX( "cudaX", columns );
-   refX. setValue( 0.0 );
-   for( int i = 0; i < columns; i ++ )
-      refX[ i ] = 1.0; //( Real ) i * 1.0 / ( Real ) size;
-   cudaX = refX;
-   csrMatrix. vectorProduct( refX, refB );
-
-   /****
-    * CSR format benchmark
-    */
-   tnlSpmvBenchmark< CSR< RealType, Devices::Host, int > > csrMatrixBenchmark;
-
-   /****
-    * Use the first instance of tnlSpmvBenchmark which we have
-    * to write the progress-table header.
-    */
-   if( verbose )
-      csrMatrixBenchmark. writeProgressTableHeader();
-
-   csrMatrixBenchmark. setup( csrMatrix );
-   csrMatrixBenchmark. setMaxIterations( maxIterations );
-   csrMatrixBenchmark. runBenchmark( refX, refB, verbose );
-   csrMatrixBenchmark. tearDown();
-
-   /****
-    * Open and write one line to the log file
-    */
-   std::fstream logFile;
-   if( logFileName )
-   {
-      logFile. open( logFileName. getString(), std::ios::out | std::ios::app );
-      if( ! logFile )
-      {
-         std::cerr << "Unable to open log file " << logFileName << " for appending logs." << std::endl;
-         return false;
-      }
-      /****
-       * Open new line of the table and write basic matrix information
-       */
-      long int allElements = csrMatrix. getRows() * csrMatrix. getColumns();
-      logFile << "          <tr>" << std::endl;
-      logFile << "             <td> <a href=\"" << pdfFileName << "\">" << inputFile << "</a> </td>" << std::endl;
-      logFile << "             <td> " << csrMatrix. getRows() << "</td>" << std::endl;
-      logFile << "             <td> " << nonzeroElements << "</td>" << std::endl;
-      logFile << "             <td> " << ( double ) nonzeroElements / allElements * 100.0 << "</td>" << std::endl;
-      csrMatrixBenchmark. writeToLogTable( logFile,
-                                           csrMatrixBenchmark. getGflops(),
-                                           inputMtxFileName,
-                                           csrMatrix,
-                                           false );
-   }
-
-#ifdef UNDEF
-   /****
-    * Cusparse CSR format benchmark
-    */
-   tnlSpmvBenchmarkCusparseCSR< Real, int > cusparseCSRBenchmark;
-   cusparseCSRBenchmark. setup( csrMatrix );
-   cusparseCSRBenchmark. setMaxIterations( maxIterations );
-   cusparseCSRBenchmark. runBenchmark( cudaX, refB, verbose );
-   cusparseCSRBenchmark. tearDown();
-
-   if( logFileName )
-       cusparseCSRBenchmark. writeToLogTable( logFile,
-                                                    csrMatrixBenchmark. getGflops(),
-                                                    inputMtxFile,
-                                                    csrMatrix,
-                                                    true );
-
-   /****
-    * Hybrid format benchmark
-    */
-   tnlSpmvBenchmarkHybridMatrix< Real, int > hybridMatrixBenchmark;
-   hybridMatrixBenchmark. setFileName( inputMtxFile );
-   hybridMatrixBenchmark. setup( csrMatrix );
-   hybridMatrixBenchmark. setMaxIterations( maxIterations );
-   hybridMatrixBenchmark. setNonzeroElements( nonzeroElements );
-   hybridMatrixBenchmark. runBenchmark( refX, refB, verbose );
-   hybridMatrixBenchmark. tearDown();
-
-   if( logFileName )
-   {
-      hybridMatrixBenchmark. writeToLogTable( logFile,
-                                              csrMatrixBenchmark. getGflops(),
-                                              inputMtxFile,
-                                              csrMatrix,
-                                              false );
-   }
-
-   /****
-    * Row-Grouped CSR format
-    */
-   bestCudaRgCSRGflops = 0.0;
-   benchmarkRgCSRFormat( csrMatrix,
-                         refX,
-                         cudaX,
-                         refB,
-                         formatTest,
-                         maxIterations,
-                         false,
-                         tnlAdaptiveGroupSizeStrategyByAverageRowSize,
-                         csrMatrixBenchmark,
-                         verbose,
-                         inputMtxFile,
-                         logFileName,
-                         logFile );
-
-   tnlSpmvBenchmarkRgCSR< Real, Devices::Host, int > hostRgCsrMatrixBenchmark;
-   hostRgCsrMatrixBenchmark. setGroupSize( 16 );
-   hostRgCsrMatrixBenchmark. setUseAdaptiveGroupSize( true );
-   hostRgCsrMatrixBenchmark. setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByAverageRowSize );
-   hostRgCsrMatrixBenchmark. setup( csrMatrix );
-   if( formatTest )
-      hostRgCsrMatrixBenchmark. testMatrix( csrMatrix, verbose );
-   hostRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-   //hostRgCsrMatrixBenchmark. runBenchmark( refX, refB, verbose );
-   hostRgCsrMatrixBenchmark. tearDown();
-   if( logFileName )
-      hostRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                 csrMatrixBenchmark. getGflops(),
-                                                 inputMtxFile,
-                                                 csrMatrix,
-                                                 true );
-   tnlSpmvBenchmarkRgCSR< Real, Devices::Cuda, int > cudaRgCsrMatrixBenchmark;
-   for( int cudaBlockSize = 32; cudaBlockSize <= 256; cudaBlockSize *= 2 )
-   {
-      cudaRgCsrMatrixBenchmark. setCudaBlockSize( cudaBlockSize );
-      cudaRgCsrMatrixBenchmark. setGroupSize( 16 );
-      cudaRgCsrMatrixBenchmark. setUseAdaptiveGroupSize( true );
-      cudaRgCsrMatrixBenchmark. setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByAverageRowSize );
-      cudaRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-      cudaRgCsrMatrixBenchmark. setup( csrMatrix );
-      if( formatTest )
-         cudaRgCsrMatrixBenchmark. testMatrix( csrMatrix, verbose );
-      cudaRgCsrMatrixBenchmark. runBenchmark( cudaX, refB, verbose );
-      if( logFileName )
-         cudaRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                    csrMatrixBenchmark. getGflops(),
-                                                    inputMtxFile,
-                                                    csrMatrix,
-                                                    false );
-   }
-   cudaRgCsrMatrixBenchmark. tearDown();
-
-   /****
-    * Row-Grouped CSR format with reordered rows
-    * The rows are now sorted decreasingly by the number of the nonzero elements
-    */
-   if( verbose )
-     std::cout << "          ------------------------------- Test with sorted matrix ----------------------------------          " << std::endl;
-
-   Vector< int, Devices::Host > rowPermutation( "rowPermutation" );
-   {
-      CSR< Real, Devices::Host > orderedCsrMatrix( "orderedCsrMatrix" );
-      csrMatrix. sortRowsDecreasingly( rowPermutation );
-
-      /****
-       * Check if the ordering is OK.
-       */
-      int rowSize = csrMatrix. getNonzeroElementsInRow( rowPermutation[ 0 ] );
-      for( int i = 1; i < csrMatrix. getSize(); i ++ )
-      {
-         if( rowSize < csrMatrix. getNonzeroElementsInRow( rowPermutation[ i ] ) )
-         {
-            std::cerr << "The rows are not sorted properly. Error is at row number " << i << std::endl;
-         }
-         rowSize = csrMatrix. getNonzeroElementsInRow( rowPermutation[ i ] );
-      }
-      orderedCsrMatrix. reorderRows( rowPermutation, csrMatrix );
-      orderedCsrMatrix. vectorProduct( refX, refB );
-      benchmarkRgCSRFormat( orderedCsrMatrix,
-                            refX,
-                            cudaX,
-                            refB,
-                            formatTest,
-                            maxIterations,
-                            false,
-                            tnlAdaptiveGroupSizeStrategyByAverageRowSize,
-                            csrMatrixBenchmark,
-                            verbose,
-                            inputMtxSortedFile,
-                            logFileName,
-                            logFile );
-
-      tnlSpmvBenchmarkRgCSR< Real, Devices::Host, int > hostRgCsrMatrixBenchmark;
-      hostRgCsrMatrixBenchmark. setGroupSize( 16 );
-      hostRgCsrMatrixBenchmark. setUseAdaptiveGroupSize( true ); // TODO: fix with true - not implemented yet
-      hostRgCsrMatrixBenchmark. setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByFirstGroup );
-      hostRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-      hostRgCsrMatrixBenchmark. setup( orderedCsrMatrix );
-      if( formatTest )
-         hostRgCsrMatrixBenchmark. testMatrix( orderedCsrMatrix, verbose );
-      //hostRgCsrMatrixBenchmark. runBenchmark( refX, refB, verbose );
-      hostRgCsrMatrixBenchmark. tearDown();
-      if( logFileName )
-         hostRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                    csrMatrixBenchmark. getGflops(),
-                                                    inputMtxSortedFile,
-                                                    csrMatrix,
-                                                    true );
-      for( int cudaBlockSize = 32; cudaBlockSize <= 256; cudaBlockSize *= 2 )
-      {
-         tnlSpmvBenchmarkRgCSR< Real, Devices::Cuda, int > cudaRgCsrMatrixBenchmark;
-         cudaRgCsrMatrixBenchmark. setCudaBlockSize( cudaBlockSize );
-         cudaRgCsrMatrixBenchmark. setGroupSize( 16 );
-         cudaRgCsrMatrixBenchmark. setUseAdaptiveGroupSize( true );
-         cudaRgCsrMatrixBenchmark. setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByFirstGroup );
-         cudaRgCsrMatrixBenchmark. setup( orderedCsrMatrix );
-         cudaRgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-
-         if( formatTest )
-            cudaRgCsrMatrixBenchmark. testMatrix( orderedCsrMatrix, verbose );
-         cudaRgCsrMatrixBenchmark. runBenchmark( cudaX, refB, verbose );
-         if( logFileName )
-            cudaRgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                       csrMatrixBenchmark. getGflops(),
-                                                       inputMtxSortedFile,
-                                                       csrMatrix,
-                                                       false );
-      }
-      cudaRgCsrMatrixBenchmark. tearDown();
-   }
-   csrMatrix. vectorProduct( refX, refB );
-
-   /****
-    * Adaptive Row-Grouped CSR format
-    */
-
-   for( int desiredChunkSize = 1; desiredChunkSize <= 32; desiredChunkSize *= 2 )
-   {
-      tnlSpmvBenchmarkAdaptiveRgCSR< Real, Devices::Cuda, int > cudaArgCsrMatrixBenchmark;
-      cudaArgCsrMatrixBenchmark. setDesiredChunkSize( desiredChunkSize );
-      for( int cudaBlockSize = 32; cudaBlockSize <= 256; cudaBlockSize *= 2 )
-      {
-         cudaArgCsrMatrixBenchmark. setCudaBlockSize( cudaBlockSize );
-         cudaArgCsrMatrixBenchmark. setup( csrMatrix );
-         if( formatTest )
-            cudaArgCsrMatrixBenchmark. testMatrix( csrMatrix, verbose );
-         cudaArgCsrMatrixBenchmark. setMaxIterations( maxIterations );
-         cudaArgCsrMatrixBenchmark. runBenchmark( cudaX, refB, verbose );
-         cudaArgCsrMatrixBenchmark. setBestRgCSRGflops( bestCudaRgCSRGflops );
-         if( logFileName )
-            cudaArgCsrMatrixBenchmark. writeToLogTable( logFile,
-                                                        csrMatrixBenchmark. getGflops(),
-                                                        inputMtxFile,
-                                                        csrMatrix,
-                                                        true );
-      }
-      cudaRgCsrMatrixBenchmark. tearDown();
-   }
-
-#endif
-
-
-   if( logFileName )
-   {
-      logFile << "          </tr>" << std::endl;
-      logFile. close();
-   }
-   return true;
-
-}
-
-int main( int argc, char* argv[] )
-{
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
-
-   if( conf_desc.parseConfigDescription( configFile ) != 0 )
-      return 1;
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-   const String& precision = parameters.getParameter< String >( "precision" );
-   if( precision == "float" )
-      if( ! benchmarkMatrix< float >( parameters ) )
-         return EXIT_FAILURE;
-   if( precision == "double" )
-      if( ! benchmarkMatrix< double >( parameters ) )
-         return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-#endif /* SPARSEMATRIXBENCHMARK_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmark.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmark.h
deleted file mode 100644
index 70f3d1fc3a234858f39c3106040f5bace3aca6f0..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmark.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmark.h  -  description
-                             -------------------
-    begin                : Dec 29, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARK_H_
-#define TNLSPMVBENCHMARK_H_
-
-#include "tnlSpmvBenchmarkBase.h"
-#include <TNL/Matrices/CSR.h>
-
-
-template< typename Matrix >
-class tnlSpmvBenchmark
-{
-};
-
-template< typename Real, typename Device, typename Index >
-class tnlSpmvBenchmark< CSR< Real, Device, Index > > : public tnlSpmvBenchmarkBase< CSR< Real, Device, Index > >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   bool setup( const CSR< RealType, Devices::Host, IndexType >& matrix );
-
-   void tearDown();
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                                    const double& csrGflops,
-                                    const String& inputMtxFile,
-                                    const CSR< RealType, Devices::Host, IndexType >& csrMatrix,
-                                    bool writeMatrixInfo  ) const;
-};
-
-#include "tnlSpmvBenchmark_impl.h"
-
-#endif /* TNLSPMVBENCHMARK_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
deleted file mode 100644
index 35a5b388c9b317d3fbf4292337814b76d6d7122b..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmarkAdaptiveRgCSR.h  -  description
-                             -------------------
-    begin                : May 15, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKADAPTIVERGCSRMATRIX_H_
-#define TNLSPMVBENCHMARKADAPTIVERGCSRMATRIX_H_
-
-#include "tnlSpmvBenchmark.h"
-
-#include <TNL/Assert.h>
-#include <TNL/Exceptions/CudaSupportMissing.h>
-
-template< typename Real, typename Device, typename Index>
-class tnlSpmvBenchmarkAdaptiveRgCSR : public tnlSpmvBenchmark< Real, Device, Index, tnlAdaptiveRgCSR >
-{
-   public:
-
-   tnlSpmvBenchmarkAdaptiveRgCSR();
-
-   bool setup( const CSR< Real, Devices::Host, Index >& matrix );
-
-   void tearDown();
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                         const double& csrGflops,
-                         const String& inputMtxFile,
-                         const CSR< Real, Devices::Host, Index >& csrMatrix,
-                         bool writeMatrixInfo  ) const;
-
-   void setDesiredChunkSize( const Index desiredChunkSize );
-
-   void setCudaBlockSize( const Index cudaBlockSize );
-
-   Index getArtificialZeroElements() const;
-
-   void setBestRgCSRGflops( const double& bestRgCSRGflops );
-
-   protected:
-
-   /****
-    * This is helper method for generating HTML table with benchmark results
-    */
-    String getBgColorByRgCSRSpeedUp( const double& speedUp ) const;
-
-   Index desiredChunkSize;
-
-   Index cudaBlockSize;
-
-   bool useAdaptiveGroupSize;
-
-   tnlAdaptiveGroupSizeStrategy adaptiveGroupSizeStrategy;
-
-   double bestRgCSRGflops;
-};
-
-template< typename Real,
-          typename Device,
-          typename Index>
-tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: tnlSpmvBenchmarkAdaptiveRgCSR()
- : desiredChunkSize( 4 ),
-   cudaBlockSize( 32 ),
-   useAdaptiveGroupSize( false ),
-   adaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByAverageRowSize ),
-   bestRgCSRGflops( 0.0 )
-
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index>
-bool tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setup( const CSR< Real, Devices::Host, Index >& matrix )
-{
-   //TNL_ASSERT( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
-   if( Device :: getDevice() == Devices::HostDevice )
-   {
-      this->matrix. tuneFormat( desiredChunkSize, cudaBlockSize );
-      if( ! this->matrix. copyFrom( matrix ) )
-         return false;
-      //matrix. printOut(std::cout, "text", 30 );
-      //this->matrix. printOut(std::cout, "text", 30 );
-   }
-   if( Device :: getDevice() == Devices::CudaDevice )
-   {
-#ifdef HAVE_CUDA
-      tnlAdaptiveRgCSR< Real, Devices::Host, Index > hostMatrix( "tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setup : hostMatrix" );
-      hostMatrix. tuneFormat( desiredChunkSize, cudaBlockSize );
-      hostMatrix. copyFrom( matrix );
-      if( ! this->matrix. copyFrom( hostMatrix ) )
-         return false;
-#else
-      return false;
-#endif
-   }
-   this->setupOk = true;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index>
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: tearDown()
-{
-   //this->matrix. setSize( 0 );
-   //this->matrix. setNonzeroElements( 0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: writeProgress() const
-{
-  std::cout << left << std::setw( this->formatColumnWidth - 15 ) << "Adap. Row-grouped CSR ";
-   if( Device :: getDevice() == Devices::CudaDevice )
-     std::cout << std::setw( 5 ) << this->desiredChunkSize
-           << std::setw( 10 ) << this->cudaBlockSize;
-   else
-     std::cout << std::setw( 15 ) << this->desiredChunkSize;
-  std::cout << right << std::setw( this->timeColumnWidth ) << std::setprecision( 2 ) << this->getTime()
-        << right << std::setw( this->iterationsColumnWidth ) << this->getIterations()
-        << right << std::setw( this->gflopsColumnWidth ) << std::setprecision( 2 ) << this->getGflops();
-   if( this->getBenchmarkWasSuccesful() )
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << " OK - maxError is " << this->maxError << ". ";
-   else
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "  FAILED";
-#ifndef HAVE_CUDA
-   if( Device :: getDevice() == Devices::CudaDevice )
-      throw Exceptions::CudaSupportMissing();
-#endif
-     std::cout << std::endl;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: writeToLogTable( std::ostream& logFile,
-                                                                                    const double& csrGflops,
-                                                                                    const String& inputMtxFile,
-                                                                                    const CSR< Real, Devices::Host, Index >& csrMatrix,
-                                                                                    bool writeMatrixInfo  ) const
-{
-   if( this->getBenchmarkWasSuccesful() )
-   {
-      String bgColor="#FFFFFF";
-      double speedUp = this->getGflops() / csrGflops;
-      double rgCsrSpeedUp( 0.0 );
-      if( this->bestRgCSRGflops )
-         rgCsrSpeedUp = this->getGflops() / this->bestRgCSRGflops;
-      switch( desiredChunkSize )
-      {
-         case 1: bgColor = "#666666"; break;
-         case 2: bgColor = "#777777"; break;
-         case 4: bgColor = "#888888"; break;
-         case 8: bgColor = "#999999"; break;
-         case 16: bgColor = "#AAAAAA"; break;
-         case 32: bgColor = "#BBBBBB"; break;
-         default: bgColor = "#FFFFFF";
-      }
-      if( writeMatrixInfo )
-      {
-         String baseFileName( inputMtxFile );
-         baseFileName += String( ".argcsr-");
-         baseFileName += String( desiredChunkSize );
-         baseFileName += String( "-" );
-         baseFileName += String( cudaBlockSize );
-         String matrixPdfFile = baseFileName + String( ".pdf" );
-         String matrixHtmlFile = baseFileName + String( ".html" );
-         tnlAdaptiveRgCSR< Real > argCsrMatrix( inputMtxFile );
-         argCsrMatrix. tuneFormat( this->desiredChunkSize,
-                                 this->cudaBlockSize );
-         argCsrMatrix. copyFrom( csrMatrix );
-         this->printMatrixInHtml( matrixHtmlFile, argCsrMatrix );
-         if( rgCsrSpeedUp > 1.0 )
-            bgColor=getBgColorByRgCSRSpeedUp( rgCsrSpeedUp );
-         logFile << "             <td bgcolor=" << bgColor << "> <a href=\"" << matrixPdfFile << "\">PDF</a>, <a href=\"" << matrixHtmlFile << "\">HTML</a></td> " << std::endl;
-         logFile << "             <td bgcolor=" << bgColor << "> " << this->getArtificialZeroElements() << "</td>" << std::endl;
-      }
-
-      bgColor = this->getBgColorBySpeedUp( speedUp );
-      String textColor = "#000000"; //getBgColorByRgCSRSpeedUp( rgCsrSpeedUp );
-      logFile << "             <td bgcolor=" << bgColor << "><font size=3 color=\"" << textColor << "\"> " << this->getTime() << "</font></td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << "><font size=3 color=\"" << textColor << "\"> " << this->getGflops() << "</font></td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << "><font size=3 color=\"" << textColor << "\"> " << speedUp << "</font></td>" << std::endl;
-
-   }
-   else
-   {
-      if( writeMatrixInfo )
-      {
-         logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-         logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      }
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setDesiredChunkSize( const Index desiredChunkSize )
-{
-   this->desiredChunkSize = desiredChunkSize;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setCudaBlockSize( const Index cudaBlockSize )
-{
-   this->cudaBlockSize = cudaBlockSize;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: getArtificialZeroElements() const
-{
-   return this->matrix. getArtificialZeroElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setBestRgCSRGflops( const double& bestRgCSRGflops )
-{
-   this->bestRgCSRGflops = bestRgCSRGflops;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-String tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: getBgColorByRgCSRSpeedUp( const double& speedUp ) const
-{
-   if( speedUp >= 30.0 )
-      return String( "#009900" );
-   if( speedUp >= 25.0 )
-      return String( "#00AA00" );
-   if( speedUp >= 20.0 )
-      return String( "#00BB00" );
-   if( speedUp >= 15.0 )
-      return String( "#00CC00" );
-   if( speedUp >= 10.0 )
-      return String( "#00DD00" );
-   if( speedUp >= 5.0 )
-      return String( "#00EE00" );
-   if( speedUp >= 1.0 )
-      return String( "#00FF00" );
-   return String( "#FFFFFF" );
-}
-
-#endif /* TNLSPMVBENCHMARKADAPTIVERGCSRMATRIX_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase.h
deleted file mode 100644
index 4e5e58078b8929f089e78022c54de6b880a49a8b..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmarkBase.h  -  description
-                             -------------------
-    begin                : May 15, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKBASE_H_
-#define TNLSPMVBENCHMARKBASE_H_
-
-#include <TNL/Matrices/CSR.h>
-#include <TNL/TimerRT.h>
-#include <TNL/Math.h>
-
-
-double tnlSpmvBenchmarkPrecision( const double& ) { return 1.0e-12; }
-float tnlSpmvBenchmarkPrecision( const float& ) { return 1.0e-4; }
-
-template< typename Matrix >
-class tnlSpmvBenchmarkBase
-{
-   public:
-
-   tnlSpmvBenchmarkBase();
- 
-   typedef typename Matrix::RealType RealType;
-   typedef typename Matrix::DeviceType DeviceType;
-   typedef typename Matrix::IndexType IndexType;
-
-   bool getBenchmarkWasSuccesful() const;
-
-   double getGflops() const;
-
-   double getTime() const;
-
-   void setMaxIterations( const int maxIterations );
-
-   int getIterations() const;
-
-   IndexType getArtificialZeros() const;
-
-   RealType getMaxError() const;
-
-   void writeProgressTableHeader();
-
-   virtual bool setup( const CSR< RealType, Devices::Host, IndexType >& matrix ) = 0;
-
-   virtual void tearDown() = 0;
-
-   virtual void writeProgress() const = 0;
-
-   /****
-    * This is virtual only the purpose of testing external formats like
-    * the Hybrid format from the CUSP library. This format is not wrapped
-    * in Matrix.
-    */
-   virtual void runBenchmark( const Vector< RealType, DeviceType, IndexType >& x,
-                              const Vector< RealType, Devices::Host, IndexType >& refB,
-                              bool verbose );
-
-   virtual void writeToLogTable( std::ostream& logFile,
-                                 const double& csrGflops,
-                                 const String& inputMtxFile,
-                                 const CSR< RealType, Devices::Host, IndexType >& csrMatrix,
-                                 bool writeMatrixInfo  ) const = 0;
-
-   protected:
-
-   /****
-    * This is helper method for generating HTML table with benchmark results
-    */
-   String getBgColorBySpeedUp( const double& speedUp ) const;
-
-   /****
-    * Helper method for writing matrix statistics and information to HTML
-    */
-   bool printMatrixInHtml( const String& fileName,
-                           Matrix< RealType, Devices::Host, IndexType >& matrix ) const;
-
-
-   bool benchmarkWasSuccesful;
-
-   bool setupOk;
-
-   double gflops;
-
-   double time;
-
-   /****
-    * Max number of SpMV repetitions.
-    */
-   int maxIterations;
-
-   /****
-    * Real number of repetitions.
-    */
-   int iterations;
-
-   IndexType artificialZeros;
-
-   RealType maxError;
-
-   IndexType firstErrorOccurence;
-
-   Matrix matrix;
-
-   /****
-    * Parameters for the progress table columns
-    */
-
-   int formatColumnWidth;
-
-   int timeColumnWidth;
-
-   int iterationsColumnWidth;
-
-   int gflopsColumnWidth;
-
-   int benchmarkStatusColumnWidth;
-
-   int infoColumnWidth;
-};
-
-
-#include "tnlSpmvBenchmarkBase_impl.h"
-#endif /* TNLSPMVBENCHMARKBASE_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
deleted file mode 100644
index 117fdd89b11cecb3ed5376c8ee53301b8d36288d..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/***************************************************************************
-                          tnlSpmBenchmarkBase_impl.h  -  description
-                             -------------------
-    begin                : Dec 29, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKBASE_IMPL_H_
-#define TNLSPMVBENCHMARKBASE_IMPL_H_
-
-template< typename  Matrix >
-tnlSpmvBenchmarkBase< Matrix >::tnlSpmvBenchmarkBase()
-   : benchmarkWasSuccesful( false ),
-     setupOk( false ),
-     gflops( 0.0 ),
-     time( 0.0 ),
-     maxIterations( 0 ),
-     iterations( 0.0 ),
-     artificialZeros( 0 ),
-     maxError( 0.0 ),
-     firstErrorOccurence( 0 ),
-     formatColumnWidth( 40 ),
-     timeColumnWidth( 12 ),
-     iterationsColumnWidth( 15 ),
-     gflopsColumnWidth( 12 ),
-     benchmarkStatusColumnWidth( 12 ),
-     infoColumnWidth( 20 )
-{
-}
-
-template< typename  Matrix >
-bool tnlSpmvBenchmarkBase< Matrix >::getBenchmarkWasSuccesful() const
-{
-   return this->benchmarkWasSuccesful;
-}
-
-template< typename Matrix >
-double tnlSpmvBenchmarkBase< Matrix >::getGflops() const
-{
-   return this->gflops;
-}
-
-template< typename Matrix >
-double tnlSpmvBenchmarkBase< Matrix >::getTime() const
-{
-   return this->time;
-}
-
-template< typename Matrix >
-void tnlSpmvBenchmarkBase< Matrix >::setMaxIterations( const int maxIterations )
-{
-   this->maxIterations = maxIterations;
-}
-
-template< typename Matrix >
-int tnlSpmvBenchmarkBase< Matrix >::getIterations() const
-{
-   return this->iterations;
-}
-
-
-template< typename Matrix >
-typename Matrix::IndexType tnlSpmvBenchmarkBase< Matrix >::getArtificialZeros() const
-{
-   return this->artificialZeros;
-}
-
-template< typename Matrix >
-typename Matrix::RealType tnlSpmvBenchmarkBase< Matrix >::getMaxError() const
-{
-   return this->maxError;
-}
-
-template< typename Matrix >
-void tnlSpmvBenchmarkBase< Matrix >::runBenchmark( const Vector< RealType, DeviceType, IndexType >& x,
-                                                   const Vector< RealType, Devices::Host, IndexType >& refB,
-                                                   bool verbose )
-{
-   benchmarkWasSuccesful = false;
-   if( ! setupOk )
-      return;
-#ifndef HAVE_CUDA
-   if( DeviceType::getDevice() == Devices::CudaDevice )
-   {
-      if( verbose )
-         writeProgress();
-      return;
-   }
-#endif
-
-   Vector< RealType, DeviceType, IndexType > b( "tnlSpmvBenchmark< Real, Device, Index, Matrix > :: runBenchmark : b" );
-   if( ! b. setSize( refB. getSize() ) )
-      return;
-
-   iterations = 0;
-
-   TimerRT rt_timer;
-   rt_timer. Reset();
-   //maxIterations = 1;
-
-   for( int i = 0; i < maxIterations; i ++ )
-   {
-      matrix. vectorProduct( x, b );
-      iterations ++;
-   }
-
-   this->time = rt_timer. getTime();
-
-   firstErrorOccurence = 0;
-   Vector< RealType, Devices::Host, IndexType > resB( "tnlSpmvBenchmark< Real, Device, Index, Matrix > :: runBenchmark : b" );
-   if( ! resB. setSize( b. getSize() ) )
-   {
-      std::cerr << "I am not able to allocate copy of vector b on the host." << std::endl;
-      return;
-   }
-   resB = b;
-   benchmarkWasSuccesful = true;
-   for( IndexType j = 0; j < refB. getSize(); j ++ )
-   {
-      //f << refB[ j ] << " - " << host_b[ j ] << " = "  << refB[ j ] - host_b[ j ] <<  std::endl;
-      RealType error( 0.0 );
-      if( refB[ j ] != 0.0 )
-         error = ( RealType ) fabs( refB[ j ] - resB[ j ] ) /  ( RealType ) fabs( refB[ j ] );
-      else
-         error = ( RealType ) fabs( refB[ j ] );
-      if( error > maxError )
-         firstErrorOccurence = j;
-      this->maxError = max( this->maxError, error );
-
-      /*if( error > tnlSpmvBenchmarkPrecision( error ) )
-         benchmarkWasSuccesful = false;*/
-
-   }
-   //cout << "First error was on " << firstErrorOccurence << std::endl;
-
-   double flops = 2.0 * iterations * matrix.getNumberOfNonzeroMatrixElements();
-   this->gflops = flops / time * 1.0e-9;
-   artificialZeros = matrix.getNumberOfMatrixElements() - matrix.getNumberOfNonzeroMatrixElements();
-
-   if( verbose )
-      writeProgress();
-}
-
-template< typename Matrix >
-void tnlSpmvBenchmarkBase< Matrix >::writeProgressTableHeader()
-{
-   int totalWidth = this->formatColumnWidth +
-                    this->timeColumnWidth +
-                    this->iterationsColumnWidth +
-                    this->gflopsColumnWidth +
-                    this->benchmarkStatusColumnWidth +
-                    this->infoColumnWidth;
-
-  std::cout << left << std::setw( this->formatColumnWidth - 5 ) << "MATRIX FORMAT"
-        << left << std::setw( 5 ) << "BLOCK"
-        << right << std::setw( this->timeColumnWidth ) << "TIME"
-        << right << std::setw( this->iterationsColumnWidth ) << "ITERATIONS"
-        << right << std::setw( this->gflopsColumnWidth ) << "GFLOPS"
-        << right << std::setw( this->benchmarkStatusColumnWidth ) << "CHECK"
-        << left << std::setw(  this->infoColumnWidth ) << " INFO" << std::endl
-        << setfill( '-' ) << std::setw( totalWidth ) << "--" << std::endl
-        << setfill( ' ');
-}
-
-template< typename Matrix >
-String tnlSpmvBenchmarkBase< Matrix > :: getBgColorBySpeedUp( const double& speedUp ) const
-{
-   if( speedUp >= 30.0 )
-      return String( "#FF9900" );
-   if( speedUp >= 25.0 )
-      return String( "#FFAA00" );
-   if( speedUp >= 20.0 )
-      return String( "#FFBB00" );
-   if( speedUp >= 15.0 )
-      return String( "#FFCC00" );
-   if( speedUp >= 10.0 )
-      return String( "#FFDD00" );
-   if( speedUp >= 5.0 )
-      return String( "#FFEE00" );
-   if( speedUp >= 1.0 )
-      return String( "#FFFF00" );
-   return String( "#FFFFFF" );
-}
-
-
-template< typename Matrix >
-bool tnlSpmvBenchmarkBase< Matrix > :: printMatrixInHtml( const String& fileName,
-                                                          Matrix< RealType, Devices::Host, IndexType >& matrix ) const
-{
-   //cout << "Writing to file " << fileName << std::endl;
-   std::fstream file;
-   file. open( fileName. getString(), std::ios::out );
-   if( ! file )
-   {
-      std::cerr << "I am not able to open the file " << fileName << std::endl;
-      return false;
-   }
-   file << "<html>" << std::endl;
-   file << "   <body>" << std::endl;
-   matrix. printOut( file, "html" );
-   file << "   </body>" << std::endl;
-   file << "</html>" << std::endl;
-   file. close();
-   return true;
-}
-
-#endif /* TNLSPMVBENCHMARKBASE_IMPL_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCSRMatrix.h
deleted file mode 100644
index 829284d6ebd52ffcbffca9546b830d2e081c45d5..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCSRMatrix.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmarkCSR.h  -  description
-                             -------------------
-    begin                : May 15, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKCSRMATRIX_H_
-#define TNLSPMVBENCHMARKCSRMATRIX_H_
-
-#include "tnlSpmvBenchmark.h"
-#include <TNL/Matrices/CSR.h>
-
-template< typename Real, typename Index>
-class tnlSpmvBenchmarkCSR : public tnlSpmvBenchmark< Real, Devices::Host, Index, CSR >
-{
-   public:
-
-   bool setup( const CSR< Real, Devices::Host, Index >& matrix );
-
-   void tearDown();
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                         const double& csrGflops,
-                         const String& inputMtxFile,
-                         const CSR< Real, Devices::Host, Index >& csrMatrix,
-                         bool writeMatrixInfo  ) const;
-   Real getForwardBackwardDifference() const;
-
-   protected:
-
-   /*!***
-    * This measures the difference between SpMV result when used forward or bakward
-    * matrix columns ordering.
-    */
-   Real forwardBackwardDifference;
-};
-
-template< typename Real, typename Index>
-bool tnlSpmvBenchmarkCSR< Real, Index > :: setup( const CSR< Real, Devices::Host, Index >& matrix )
-{
-   this->matrix = matrix;
-
-   const Index size = matrix. getSize();
-   Vector< Real, Devices::Host > refX( "ref-x", size ), refB( "ref-b", size), backwardRefB( "backwardRef-b", size);
-   refX. setValue( 1.0 );
-   this->matrix. vectorProduct( refX, refB );
-   this->matrix. setBackwardSpMV( true );
-   this->matrix. vectorProduct( refX, backwardRefB );
-   this->matrix. setBackwardSpMV( false );
-   Real error( 0.0 ), maxError( 0.0 );
-   for( Index j = 0; j < refB. getSize(); j ++ )
-   {
-      if( refB[ j ] != 0.0 && backwardRefB[ j ] != 0.0 )
-         error = ( Real ) fabs( refB[ j ] - backwardRefB[ j ] ) / min( ( Real ) fabs( refB[ j ] ), ( Real ) fabs( backwardRefB[ j ] ) );
-      else
-         error = max( ( Real ) fabs( refB[ j ] ), ( Real ) fabs( backwardRefB[ j ] ) );
-      maxError = max( error, maxError );
-   }
-   forwardBackwardDifference = maxError;
-   this->setupOk = true;
-   return true;
-}
-
-template< typename Real, typename Index>
-void tnlSpmvBenchmarkCSR< Real, Index > :: tearDown()
-{
-   this->matrix. setSize( 0 );
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkCSR< Real, Index > :: writeProgress() const
-{
-  std::cout << left << std::setw( this->formatColumnWidth ) << "CSR";
-   //  std::cout << left << std::setw( 25 ) << matrixFormat << std::setw( 5 ) << cudaBlockSize;
-  std::cout << right << std::setw( this->timeColumnWidth ) << std::setprecision( 2 ) << this->getTime()
-        << right << std::setw( this->iterationsColumnWidth ) << this->getIterations()
-        << right << std::setw( this->gflopsColumnWidth ) << std::setprecision( 2 ) << this->getGflops();
-   if( this->getBenchmarkWasSuccesful() )
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << " OK - SpMV diff. " << getForwardBackwardDifference();
-   else
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << " FAILED ";
-  std::cout << std::endl;
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkCSR< Real, Index > :: writeToLogTable( std::ostream& logFile,
-                                                                  const double& csrGflops,
-                                                                  const String& inputMtxFile,
-                                                                  const CSR< Real, Devices::Host, Index >& csrMatrix,
-                                                                  bool writeMatrixInfo  ) const
-{
-   if( this->getBenchmarkWasSuccesful() )
-   {
-      logFile << "             <td> " << this->getTime() << "</font></td>" << std::endl;
-      logFile << "             <td> " << this->getGflops() << "</td>" << std::endl;
-   }
-   else
-   {
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-   }
-}
-
-template< typename Real,
-          typename Index >
-Real tnlSpmvBenchmarkCSR< Real, Index > :: getForwardBackwardDifference() const
-{
-   return forwardBackwardDifference;
-}
-
-#endif /* TNLSPMVBENCHMARKCSRMATRIX_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCusparseCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCusparseCSRMatrix.h
deleted file mode 100644
index 582cc5b9def78301525a8b34929cfc33f2fa210e..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkCusparseCSRMatrix.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmarkCusparseCSR.h  -  description
-                             -------------------
-    begin                : Feb 16, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKCUSPARSECSRMATRIX_H_
-#define TNLSPMVBENCHMARKCUSPARSECSRMATRIX_H_
-
-#include "tnlSpmvBenchmark.h"
-#include <TNL/tnlConfig.h>
-#include <TNL/legacy/matrices/tnlCusparseCSR.h>
-
-template< typename Real, typename Index>
-class tnlSpmvBenchmarkCusparseCSR : public tnlSpmvBenchmark< Real, Devices::Cuda, Index, tnlCusparseCSR >
-{
-   public:
-   tnlSpmvBenchmarkCusparseCSR();
-
-   bool setup( const CSR< Real, Devices::Host, Index >& matrix );
-
-   void tearDown();
-
-   Index getArtificialZeros() const;
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                         const double& csrGflops,
-                         const String& inputMtxFile,
-                         const CSR< Real, Devices::Host, Index >& csrMatrix,
-                         bool writeMatrixInfo  ) const;
-
-   void setNonzeroElements( const Index nonzeroElements );
-};
-
-template< typename Real, typename Index>
-bool tnlSpmvBenchmarkCusparseCSR< Real, Index > :: setup( const CSR< Real, Devices::Host, Index >& matrix )
-{
-   if( ! this->matrix. copyFrom( matrix ) )
-      return false;
-   this->setupOk = true;
-   return true;
-}
-
-template< typename Real,
-          typename Index>
-void tnlSpmvBenchmarkCusparseCSR< Real, Index > :: tearDown()
-{
-   this->matrix. reset();
-}
-
-template< typename Real,
-          typename Index>
-Index tnlSpmvBenchmarkCusparseCSR< Real, Index > :: getArtificialZeros() const
-{
-   return 0;
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkCusparseCSR< Real, Index > :: writeProgress() const
-{
-  std::cout << left << std::setw( this->formatColumnWidth ) << "Cusparse";
-   //  std::cout << left << std::setw( 25 ) << matrixFormat << std::setw( 5 ) << cudaBlockSize;
-  std::cout << right << std::setw( this->timeColumnWidth ) << std::setprecision( 2 ) << this->getTime()
-        << right << std::setw( this->iterationsColumnWidth ) << this->getIterations()
-        << right << std::setw( this->gflopsColumnWidth ) << std::setprecision( 2 ) << this->getGflops();
-   if( this->getBenchmarkWasSuccesful() )
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "OK ";
-   else
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "  FAILED - maxError is " << this->maxError << ". ";
-#ifndef HAVE_CUSP
-  std::cout << "CUSPARSE library is missing.";
-#endif
-  std::cout << std::endl;
-}
-
-template< typename Real,
-          typename Index >
-tnlSpmvBenchmarkCusparseCSR< Real, Index > :: tnlSpmvBenchmarkCusparseCSR()
-{
-
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkCusparseCSR< Real, Index > :: writeToLogTable( std::ostream& logFile,
-                                                                       const double& csrGflops,
-                                                                       const String& inputMtxFile,
-                                                                       const CSR< Real, Devices::Host, Index >& csrMatrix,
-                                                                       bool writeMatrixInfo  ) const
-{
-   if( this->getBenchmarkWasSuccesful() )
-   {
-      double speedUp = this->getGflops() / csrGflops;
-      String bgColor = this->getBgColorBySpeedUp( speedUp );
-      logFile << "             <td bgcolor=" << bgColor << ">" << this->getTime() << "</td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << ">" << this->getGflops() << "</td>" << std::endl;
-
-      logFile << "             <td bgcolor=" << bgColor << "> " << speedUp << "</td>" << std::endl;
-   }
-   else
-   {
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-   }
-}
-
-#endif /* TNLSPMVBENCHMARKCUSPARSECSRMATRIX_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
deleted file mode 100644
index 71694e0f561aef3b87feda04fb24ce41487d92f3..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmarkHybridMatrix.h  -  description
-                             -------------------
-    begin                : May 15, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKHYBRIDMATRIX_H_
-#define TNLSPMVBENCHMARKHYBRIDMATRIX_H_
-
-#include "tnlSpmvBenchmark.h"
-#include <TNL/tnlConfig.h>
-#ifdef HAVE_CUSP
-   #include <cusp/hyb_matrix.h>
-   #include <cusp/io/matrix_market.h>
-   #include <cusp/multiply.h>
-   #include <cusp/print.h>
-#endif
-
-
-template< typename Real, typename Index>
-class tnlSpmvBenchmarkHybridMatrix : public tnlSpmvBenchmark< Real, Devices::Host, Index, CSR >
-{
-   public:
-
-   void setFileName( const String& fileName );
-
-   bool setup( const CSR< Real, Devices::Host, Index >& matrix );
-
-   void tearDown();
-
-   void runBenchmark( const Vector< Real, Devices::Host, Index >& x,
-                      const Vector< Real, Devices::Host, Index >& refB,
-                      bool verbose );
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                         const double& csrGflops,
-                         const String& inputMtxFile,
-                         const CSR< Real, Devices::Host, Index >& csrMatrix,
-                         bool writeMatrixInfo  ) const;
-
-   void setNonzeroElements( const Index nonzeroElements );
-
-   protected:
-
-   String fileName;
-
-   Index nonzeroElements;
-};
-
-template< typename Real, typename Index>
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: setFileName( const String& fileName )
-{
-   this->fileName = fileName;
-}
-
-template< typename Real, typename Index>
-bool tnlSpmvBenchmarkHybridMatrix< Real, Index > :: setup( const CSR< Real, Devices::Host, Index >& matrix )
-{
-   return true;
-}
-
-template< typename Real,
-          typename Index>
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: tearDown()
-{
-
-}
-
-template< typename Real,
-          typename Index>
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: runBenchmark( const Vector< Real, Devices::Host, Index >& _x,
-                                                                  const Vector< Real, Devices::Host, Index >& refB,
-                                                                  bool verbose )
-{
-   this->benchmarkWasSuccesful = false;
-#ifdef HAVE_CUSP
-   try
-   {
-      // create an empty sparse matrix structure (HYB format)
-      cusp::hyb_matrix< Index, Real, cusp::device_memory > A;
-
-      // load a matrix stored in MatrixMarket format
-      cusp::io::read_matrix_market_file( A, this->fileName. getString() );
-
-      // allocate storage for solution (x) and right hand side (b)
-      cusp::array1d< Real, cusp::host_memory > host_x( A.num_rows, 1 );
-      cusp::array1d< Real, cusp::device_memory > x( A.num_rows, 1 );
-      cusp::array1d< Real, cusp::device_memory > b( A.num_rows, 0 );
-
-      for( Index j = 0; j < refB. getSize(); j ++ )
-         host_x[ j ] = _x[ j ];
-
-      x = host_x;
-
-      TimerRT rt_timer;
-      rt_timer. Reset();
-
-      this->iterations = 0;
-      //while( rt_timer. getTime() < time )
-      {
-         for( int i = 0; i < this->maxIterations; i ++ )
-         {
-            cusp :: multiply( A, x, b );
-            cudaThreadSynchronize();
-            this->iterations ++;
-         }
-      }
-      this->time = rt_timer. getTime();
-
-      cusp::array1d< Real, cusp::host_memory > host_b( b );
-      host_b = b;
-
-      for( Index j = 0; j < refB. getSize(); j ++ )
-      {
-         //f << refB[ j ] << " - " << host_b[ j ] << " = "  << refB[ j ] - host_b[ j ] <<  std::endl;
-         if( refB[ j ] != 0.0 )
-            this->maxError = max( this->maxError, ( Real ) fabs( refB[ j ] - host_b[ j ] ) /  ( Real ) fabs( refB[ j ] ) );
-         else
-            this->maxError = max( this->maxError, ( Real ) fabs( refB[ j ] ) );
-      }
-      //if( this->maxError < 1.0 )
-         this->benchmarkWasSuccesful = true;
-      //else
-      //   this->benchmarkWasSuccesful = false;
-
-
-      double flops = 2.0 * this->iterations * this->nonzeroElements;
-      this->gflops = flops / this->time * 1.0e-9;
-
-   }
-   catch( std::bad_alloc )
-   {
-      writeProgress();
-      return;
-   }
-#else
-   this->benchmarkWasSuccesful = false;
-#endif
-   writeProgress();
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: writeProgress() const
-{
-  std::cout << left << std::setw( this->formatColumnWidth ) << "Hybrid";
-   //  std::cout << left << std::setw( 25 ) << matrixFormat << std::setw( 5 ) << cudaBlockSize;
-  std::cout << right << std::setw( this->timeColumnWidth ) << std::setprecision( 2 ) << this->getTime()
-        << right << std::setw( this->iterationsColumnWidth ) << this->getIterations()
-        << right << std::setw( this->gflopsColumnWidth ) << std::setprecision( 2 ) << this->getGflops();
-   if( this->getBenchmarkWasSuccesful() )
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "OK ";
-   else
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "  FAILED - maxError is " << this->maxError << ". ";
-#ifndef HAVE_CUSP
-  std::cout << "CUSP library is missing.";
-#endif
-  std::cout << std::endl;
-}
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: writeToLogTable( std::ostream& logFile,
-                                                                     const double& csrGflops,
-                                                                     const String& inputMtxFile,
-                                                                     const CSR< Real, Devices::Host, Index >& csrMatrix,
-                                                                     bool writeMatrixInfo  ) const
-{
-   if( this->getBenchmarkWasSuccesful() )
-   {
-      double speedUp = this->getGflops() / csrGflops;
-      String bgColor = this->getBgColorBySpeedUp( speedUp );
-      logFile << "             <td bgcolor=" << bgColor << ">" << this->getTime() << "</td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << ">" << this->getGflops() << "</td>" << std::endl;
-
-      logFile << "             <td bgcolor=" << bgColor << "> " << speedUp << "</td>" << std::endl;
-   }
-   else
-   {
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-   }
-}
-
-
-template< typename Real,
-          typename Index >
-void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: setNonzeroElements( const Index nonzeroElements )
-{
-   this->nonzeroElements = nonzeroElements;
-}
-
-#endif /* TNLSPMVBENCHMARKHYBRIDMATRIX_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h
deleted file mode 100644
index 6327ac95d659ebc6592d458f27288a11cf34d141..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h
+++ /dev/null
@@ -1,237 +0,0 @@
- /***************************************************************************
-                          tnlSpmvBenchmarkRgCSR.h  -  description
-                             -------------------
-    begin                : May 15, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARKRGCSRMATRIX_H_
-#define TNLSPMVBENCHMARKRGCSRMATRIX_H_
-
-#include "tnlSpmvBenchmark.h"
-
-#include <TNL/Exceptions/CudaSupportMissing.h>
-
-template< typename Real, typename Device, typename Index>
-class tnlSpmvBenchmarkRgCSR : public tnlSpmvBenchmark< Real, Device, Index, tnlRgCSR >
-{
-   public:
-
-   tnlSpmvBenchmarkRgCSR();
-
-   bool setup( const CSR< Real, Devices::Host, Index >& matrix );
-
-   void tearDown();
-
-   void writeProgress() const;
-
-   void writeToLogTable( std::ostream& logFile,
-                         const double& csrGflops,
-                         const String& inputMtxFile,
-                         const CSR< Real, Devices::Host, Index >& csrMatrix,
-                         bool writeMatrixInfo ) const;
-
-   void setGroupSize( const Index groupSize );
-
-   void setCudaBlockSize( const Index cudaBlockSize );
-
-   void setUseAdaptiveGroupSize( bool useAdaptiveGroupSize );
-
-   void setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategy adaptiveGroupSizeStrategy );
-
-   Index getArtificialZeroElements() const;
-
-   protected:
-
-   Index groupSize;
-
-   Index cudaBlockSize;
-
-   bool useAdaptiveGroupSize;
-
-   tnlAdaptiveGroupSizeStrategy adaptiveGroupSizeStrategy;
-};
-
-template< typename Real,
-          typename Device,
-          typename Index>
-tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: tnlSpmvBenchmarkRgCSR()
- : groupSize( 0 ),
-   cudaBlockSize( 0 ),
-   useAdaptiveGroupSize( false ),
-   adaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategyByAverageRowSize )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index>
-bool tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setup( const CSR< Real, Devices::Host, Index >& csrMatrix )
-{
-   TNL_ASSERT( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
-   if( Device :: getDevice() == Devices::HostDevice )
-   {
-      this->matrix. tuneFormat( groupSize,
-                                  this->useAdaptiveGroupSize,
-                                  this->adaptiveGroupSizeStrategy );
-      if( ! this->matrix. copyFrom( csrMatrix ) )
-         return false;
-   }
-   if( Device :: getDevice() == Devices::CudaDevice )
-   {
-#ifdef HAVE_CUDA
-      tnlRgCSR< Real, Devices::Host, Index > hostMatrix( "tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setup : hostMatrix" );
-      hostMatrix. tuneFormat( groupSize,
-                              this->useAdaptiveGroupSize,
-                              this->adaptiveGroupSizeStrategy );
-      hostMatrix. copyFrom( csrMatrix );
-      if( ! this->matrix. copyFrom( hostMatrix ) )
-         return false;
-#else
-      return false;
-#endif
-   }
-   this->setupOk = true;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index>
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: tearDown()
-{
-   this->matrix. reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: writeProgress() const
-{
-  std::cout << left << std::setw( this->formatColumnWidth - 15 ) << "Row-grouped CSR ";
-   if( Device :: getDevice() == Devices::CudaDevice )
-   {
-      if( useAdaptiveGroupSize )
-        std::cout << std::setw( 5 ) << "Var.";
-      else
-        std::cout << std::setw( 5 ) << this->groupSize;
-     std::cout << std::setw( 10 ) << this->cudaBlockSize;
-   }
-   else
-   {
-      if( useAdaptiveGroupSize )
-        std::cout << std::setw( 15 ) << "Var.";
-      else
-        std::cout << std::setw( 15 ) << this->groupSize;
-   }
-  std::cout << right << std::setw( this->timeColumnWidth ) << std::setprecision( 2 ) << this->getTime()
-        << right << std::setw( this->iterationsColumnWidth ) << this->getIterations()
-        << right << std::setw( this->gflopsColumnWidth ) << std::setprecision( 2 ) << this->getGflops();
-   if( this->getBenchmarkWasSuccesful() )
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "  OK  - maxError is " << this->maxError << ". ";
-   else
-       std::cout << right << std::setw( this->benchmarkStatusColumnWidth ) << "  FAILED - maxError is " << this->maxError << ". ";
-#ifndef HAVE_CUDA
-   if( Device :: getDevice() == Devices::CudaDevice )
-      throw Exceptions::CudaSupportMissing();
-#endif
-     std::cout << std::endl;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setGroupSize( const Index groupSize )
-{
-   this->groupSize = groupSize;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setCudaBlockSize( const Index cudaBlockSize )
-{
-   this->matrix. setCUDABlockSize( cudaBlockSize );
-   this->cudaBlockSize = cudaBlockSize;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setUseAdaptiveGroupSize( bool useAdaptiveGroupSize )
-{
-   this->useAdaptiveGroupSize = useAdaptiveGroupSize;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setAdaptiveGroupSizeStrategy( tnlAdaptiveGroupSizeStrategy adaptiveGroupSizeStrategy )
-{
-   this->adaptiveGroupSizeStrategy = adaptiveGroupSizeStrategy;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: getArtificialZeroElements() const
-{
-   return this->matrix. getArtificialZeroElements();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: writeToLogTable( std::ostream& logFile,
-                                                                            const double& csrGflops,
-                                                                            const String& inputMtxFile,
-                                                                            const CSR< Real, Devices::Host, Index >& csrMatrix,
-                                                                            bool writeMatrixInfo ) const
-{
-   String bgColor;
-   switch( groupSize )
-   {
-      case 16: bgColor = "#5555FF"; break;
-      case 32: bgColor = "#9999FF"; break;
-      case 64: bgColor = "#CCCCFF"; break;
-      default: bgColor = "#FFFFFF";
-   }
-   if( writeMatrixInfo )
-   {
-      String baseFileName( inputMtxFile );
-      baseFileName += String( ".rgcsr-");
-      baseFileName += String( groupSize );
-      String matrixPdfFile( baseFileName );
-      matrixPdfFile += String( ".pdf" );
-      String matrixHtmlFile( baseFileName );
-      matrixHtmlFile += String( ".html" );
-      tnlRgCSR< Real > rgCsrMatrix( inputMtxFile );
-      rgCsrMatrix. tuneFormat( this->groupSize,
-                               this->useAdaptiveGroupSize,
-                               this->adaptiveGroupSizeStrategy );
-      rgCsrMatrix. copyFrom( csrMatrix );
-      this->printMatrixInHtml( matrixHtmlFile, rgCsrMatrix );
-      logFile << "             <td bgcolor=" << bgColor << "> <a href=\"" << matrixPdfFile << "\">PDF</a>,<a href=\"" << matrixHtmlFile << "\"> HTML</a></td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << "> " << this->getArtificialZeroElements() << "</td>" << std::endl;
-   }
-   if( this->getBenchmarkWasSuccesful() )
-   {
-      const double speedUp = this->getGflops() / csrGflops;
-      bgColor =  this->getBgColorBySpeedUp( speedUp );
-      logFile << "             <td bgcolor=" << bgColor << ">" << this->getTime() << "</td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << "> " << this->getGflops() << "</td>" << std::endl;
-      logFile << "             <td bgcolor=" << bgColor << "> " << speedUp << "</td>" << std::endl;
-   }
-   else
-   {
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-      logFile << "             <td bgcolor=#FF0000> N/A </td>" << std::endl;
-   }
-}
-
-
-#endif /* TNLSPMVBENCHMARKRGCSRMATRIX_H_ */
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmark_impl.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmark_impl.h
deleted file mode 100644
index 24bc0dca0ce5c0fe82018d29d8790a10c3bdf63c..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmark_impl.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/***************************************************************************
-                          tnlSpmvBenchmark_impl.h  -  description
-                             -------------------
-    begin                : Dec 29, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLSPMVBENCHMARK_IMPL_H_
-#define TNLSPMVBENCHMARK_IMPL_H_
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool tnlSpmvBenchmark< CSR< Real, Device, Index > >::setup( const CSR< RealType, Devices::Host, IndexType >& matrix )
-{
-   this->matrix = matrix;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmark< CSR< Real, Device, Index > >::tearDown()
-{
-   this->matrix.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmark< CSR< Real, Device, Index > >::writeProgress() const
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlSpmvBenchmark< CSR< Real, Device, Index > >::writeToLogTable( std::ostream& logFile,
-                                                                               const double& csrGflops,
-                                                                               const String& inputMtxFile,
-                                                                               const CSR< RealType, Devices::Host, IndexType >& csrMatrix,
-                                                                               bool writeMatrixInfo  ) const
-{
-
-}
-
-#endif /* TNLSPMVBENCHMARK_IMPL_H_ */
diff --git a/src/TNL/legacy/curve/Curve.h b/src/TNL/legacy/curve/Curve.h
deleted file mode 100644
index f6f0408db82016340bbce152de4fdeb3111ab4f8..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/curve/Curve.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/***************************************************************************
-                          Curve.h  -  description
-                             -------------------
-    begin                : 2007/06/27
-    copyright            : (C) 2007 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <iomanip>
-#include <fstream>
-#include <cstring>
-#include <TNL/Containers/List.h>
-#include <TNL/Object.h>
-#include <TNL/Math.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/param-types.h>
-
-namespace TNL {
-
-//! Basic structure for curves
-template< class T >
-class CurveElement
-{
-   public:
-   CurveElement() {};
-
-   CurveElement( const T& pos,
-                  bool _speparator = false )
-      : position( pos ),
-        separator( _speparator ) {};
- 
-   bool save( File& file ) const
-   {
-      if( ! file. write( &position ) )
-         return false;
-      if( ! file. write( &separator ) )
-         return false;
-      return true;
-   };
- 
-   bool load( File& file )
-   {
-      if( ! file. read( &position ) )
-         return false;
-      if( ! file. read( &separator ) )
-         return false;
-      return true;
-   };
- 
-   T position;
- 
-   bool separator;
-};
-
-template< class T >
-class Curve
- : public Object,
-   public Containers::List< CurveElement< T > >
-{
-   public:
-   //! Basic contructor
-   Curve( const char* name )
-   : Object()
-// FIXME: name property has been removed from Object
-//   : Object( name )
-   {
-   };
-
-   //! Destructor
-   ~Curve()
-   { };
-
-   String getType() const
-   {
-      return String( "Curve< " ) + String( TNL::getType< T >() ) + String( " >" );
-   };
-
-   //! Append new point
-   void Append( const T& vec, bool separator = false )
-   {
-      Containers::List< CurveElement< T > > :: Append( CurveElement< T >( vec, separator ) );
-   };
-
-   //! Erase the curve
-   void Erase()
-   {
-      Containers::List< CurveElement< T > >::reset();
-   };
- 
-   //! Method for saving the object to a file as a binary data
-   bool save( File& file ) const
-   {
-      if( ! Object :: save( file ) ) return false;
-      if( ! Containers::List< CurveElement< T > > :: DeepSave( file ) ) return false;
-      return true;
-   };
-
-   //! Method for restoring the object from a file
-   bool load( File& file )
-   {
-      if( ! Object :: load( file ) ) return false;
-      if( ! Containers::List< CurveElement< T > > :: DeepLoad( file ) ) return false;
-      return true;
-   };
-
-   //! Method for saving the object to a file as a binary data
-   bool save( const String& fileName ) const
-   {
-      return Object :: save( fileName );
-   };
-
-   //! Method for restoring the object from a file
-   bool load( const String& fileName )
-   {
-      return Object :: load( fileName );
-   };
-
-};
-
-template< class T > bool Write( const Curve< T >& curve,
-                                std::ostream& str,
-                                const char* format,
-                                const int step = 1 )
-{
-   if( ! format )
-   {
-      std::cerr << "No format given for drawing 2D grid. " << std::endl;
-      return false;
-   }
-   if( curve. isEmpty() )
-   {
-      std::cerr << "Unable to draw curve, it's empty!" << std::endl;
-      return false;
-   }
-   if( strcmp( format, "gnuplot" ) == 0 )
-   {
-      const int size = curve. getSize();
-      int i, j;
-      for( i = 0; i < size; i += step )
-      {
-         if( curve[ i ]. separator )
-            str << std::endl;
-         else
-            str << std::setprecision( 12 )
-                << curve[ i ]. position[ 0 ] << " "
-                << curve[ i ]. position[ 1 ] << std::endl;
-         for( j = 0; j < step; j ++ )
-            if( curve[ i + j ]. separator ) str << std::endl;
-      }
-      return true;
-   }
-   std::cerr << "Unknown format '" << format << "' for drawing a curve." << std::endl;
-   return false;
-};
-
-template< class T > bool Write( const Curve< T >& curve,
-                                const char* file_name,
-                                const char* format,
-                                const int step = 1 )
-{
-
-   if( strncmp( format, "tnl",3 ) == 0 )
-   {
-      File file;
-      if( ! file. open( String( file_name ) + String( ".tnl" ), IOMode::write ) )
-      {
-         std::cerr << "I am not able to open the file " << file_name << " for drawing curve." << std::endl;
-         return false;
-      }
-      if( ! curve. save( file ) )
-      {
-         std::cerr << "I am not able to write to the file " << file_name << " for drawing grid." << std::endl;
-         return false;
-      }
-      file. close();
-   }
-   else
-   {
-      std::fstream file;
-      file. open( file_name, std::ios::out );
-      if( ! file )
-      {
-         std::cerr << "I am not able to to open the file " << file_name << " for drawing curve." << std::endl;
-         return false;
-      }
-      bool result = Write( curve, file, format, step );
-      file. close();
-      if( ! result )
-      {
-         std::cerr << "Sorry I could not write to the file " << file_name << std::endl;
-         return false;
-      }
-   }
-   return true;
-};
-
-template< class T > bool Read( Curve< T >& crv,
-                               const char* input_file )
-{
-   File file;
-   if( ! file. open( String( input_file ), IOMode::read  ) )
-   {
-     std::cout << " unable to open file " << input_file << std::endl;
-      return false;
-   }
-   if( ! crv. load( file ) )
-   {
-     std::cout << " unable to restore the data " << std::endl;
-      return false;
-   }
-   file. close();
-   return true;
-}
-
-// Explicit instatiation
-template class Curve< Containers::StaticVector< 2, double > >;
-
-} // namespace TNL
-
diff --git a/src/TNL/legacy/curve/tnlcurve2gnuplot.cpp b/src/TNL/legacy/curve/tnlcurve2gnuplot.cpp
deleted file mode 100644
index 5a1e297d60973b8a1a6f8f7f053732cdcce9fb0c..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/curve/tnlcurve2gnuplot.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/***************************************************************************
-                          tnlCurve2gnuplot.cpp  -  description
-                             -------------------
-    begin                : 2007/12/16
-    copyright            : (C) 2007 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Curve.h>
-#include <TNL/Containers/StaticVector.h>
-
-using namespace TNL;
-
-void setupConfig( Config::ConfigDescription& config )
-{
-   config.addDelimiter                            ( "General settings:" );
-   config.addRequiredList< String >(  "input-files", "Input files." );
-   config.addList< String >( "output-files", "Output files." );
-   config.addEntry< int >( "output-step", "Decrease number of the output curve nodes." );
-   config.addEntry< String >( "output-file-format", "Output file format. Can be gnuplot.", "gnuplot" );
-}
-
-//--------------------------------------------------------------------------
-int main( int argc, char* argv[] )
-{
-   Config::ParameterContainer parameters;
-   Config::ConfigDescription conf_desc;
- 
-   setupConfig( conf_desc );
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return 1;
-   }
-
-   Containers::List< String > input_files = parameters. getParameter< Containers::List< String > >( "input-files" );
-   Containers::List< String > output_files;
-   if( ! parameters. getParameter< Containers::List< String > >( "output-files", output_files ) )
-      std::cout << "No output files were given." << std::endl;
-   int output_step( 1 );
-   parameters. getParameter< int >( "output-step", output_step );
-   String output_file_format = parameters. getParameter< String >( "output-file-format" );
-
-   int size = input_files. getSize();
-   /*if( size != output_files. getSize() )
-   {
-      std::cerr << "Sorry, there is different number of input and output files." << std::endl;
-      return 1;
-   }*/
-   int i;
-   Curve< Containers::StaticVector< 2, double > > crv( "tnlcurve2gnuplot:curve" );
-   for( i = 0; i < size; i ++ )
-   {
-      const char* input_file = input_files[ i ]. getString();
-      std::cout << "Processing file " << input_file << " ... " << std::flush;
- 
-      File file;
-      if( ! file. open( input_files[ i ], IOMode::read ) )
-      {
-         std::cout << " unable to open file " << input_files[ i ] << std::endl;
-         continue;
-      }
-      if( ! crv. load( file ) )
-      {
-         std::cout << " unable to restore the data " << std::endl;
-         continue;
-      }
-      file. close();
-
-      Curve< Containers::StaticVector< 2, double > > out_crv( "tnlcurve2gnuplot:outcurve" );
-      const int size = crv. getSize();
-      int i;
-      for( i = 0; i < size; i += output_step )
-      {
-         out_crv. Append( crv[ i ]. position, crv[ i ]. separator );
-         //StaticVector< 2, double > v = crv[ i ]. position;
-         //v[ 0 ] = u( i );
-         //v[ 1 ] = u( i + 1 );
-         //out_crv. Append( v );
-      }
-
-      String output_file_name;
-      if( ! output_files. isEmpty() ) output_file_name = output_files[ i ];
-      else
-      {
-         if( output_file_format == "gnuplot" )
-            output_file_name += ".gplt";
-      }
-      std::cout << " writing... " << output_file_name << std::endl;
-      if( ! Write( out_crv, output_file_name. getString(), output_file_format. getString() ) )
-      {
-         std::cerr << " unable to write to " << output_file_name << std::endl;
-      }
-   }
-}
diff --git a/src/TNL/legacy/incompressible-navier-stokes/CMakeLists.txt b/src/TNL/legacy/incompressible-navier-stokes/CMakeLists.txt
deleted file mode 100755
index 1cf94862f06974476a4a48ab83fccb97b269a27d..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set( tnl_incompressible_navier_stokes_SOURCES     
-     tnl-incompressible-navier-stokes.cpp
-tnlExplicitINSTimeStepper_impl.h
-tnlExplicitINSTimeStepper.h
-tnlIncompressibleNavierStokesProblem_impl.h
-tnlIncompressibleNavierStokesProblem.h
-tnlNSFastBuildConfig.h
-visit_writer.h
-visit_writer.cpp
-solver.h
-base.h
-      )
-               
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(tnl-incompressible-navier-stokes${debugExt} tnl-incompressible-navier-stokes.cu)
-   
-ELSE(  BUILD_CUDA )               
-   ADD_EXECUTABLE(tnl-incompressible-navier-stokes${debugExt} ${tnl_incompressible_navier_stokes_SOURCES})
-   
-ENDIF( BUILD_CUDA )
-
-target_link_libraries (tnl-incompressible-navier-stokes${debugExt} tnl${debugExt}-${tnlVersion} )
-
-INSTALL( TARGETS tnl-incompressible-navier-stokes${debugExt}
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES tnl-run-incompressible-navier-stokes
-#         DESTINATION share/tnl-${tnlVersion}/examples/incompressible-navier-stokes )
diff --git a/src/TNL/legacy/incompressible-navier-stokes/base.h b/src/TNL/legacy/incompressible-navier-stokes/base.h
deleted file mode 100644
index 725d275f6b5a96c79fb8a2164f80ee6b443db62e..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/base.h
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef __BASE_H__
-#define __BASE_H__
-#define USE_CUDA 0
-
-#include <cmath>
-#include "base_structs.h"
-
-inline bool Equal(double a, double b)
-{
-	a = abs(a); b = abs(b);
-	double bigger = (a> b)?a: b;
-	if (bigger < 1e-8) return true;
-	double diff = a-b;
-	if (diff < 0) diff *=-1;
-	if (diff < 0.0000001*bigger || diff < 0.0000001) return true;
-
-	if (std::isnan((double) a) && std::isnan((double) b)) return true;
-	if (std::isinf(a) && std::isinf(b)) return true;
-	return false;
-}
-
-bool CheckResults(ArrayD &a, ArrayD &b)
-{
-	if (a.size() != b.size()) {printf("Array sizes don't match\n"); return false;}
-	int count = 0;
-	ArrayD aa, ba;
-	ArrayBaseD ac=a, bc=b;
-	if (a._onGPU) {aa.clone(a,false,false); ac=aa;}
-	if (b._onGPU) {ba.clone(b,false,false); bc=ba;}
-	for (int i = 0 ; i < ac.size(); i++)
-		if (!Equal(ac[i], bc[i]))
-		{
-			printf("Chyba na %d mezi %e %e dela %f\n", i, ac[i], bc[i], abs(ac[i]-bc[i])/std::max(ac[i], bc[i]));
-			if (count++ > 20) return false;
-		}
-	if (count==0) printf("Shodne\n");
-	return count==0;
-}
-
-template <class MatrixType> void Mult(const MatrixType &A, const double * vec, double *out)
-{
-	//if (vec.size()< A.num_cols()) throw "CSRMatrix::mult - vec vector not big enough";
-
-	int N = A.num_rows();
-	#pragma omp parallel for schedule(static)
-	for (int ri = 0; ri < N; ri++)
-	{
-		out[ri] = 0;
-		for (int i = A.num_in_row(ri)-1; i>=0; i--)
-		{
-			double val=0; int col=0;
-			A.get_el_in_row(ri,i,val,col);
-			out[ri] += val * vec[col];
-		}
-	}
-}
-
-template <class MatrixType>
-__cuda_call__ void JacobiIterKernel(const MatrixType &A, const ArrayBaseD &b, const ArrayBaseD & x, ArrayBaseD & out_x, const double damping, int r)
-{
-	double diag = 1;
-	double nonDiag = 0;
-
-	for (int i = 0; i < A.num_in_row(r); i++)
-	{
-		double aVal; int c;
-		A.get_el_in_row(r,i,aVal,c);
-
-		if (c==r) diag = aVal;
-		else nonDiag += aVal * x[c];
-	}
-	out_x[r] = (1.0 - damping)*x[r] + damping*(b[r] - nonDiag)/diag;
-}
-
-#if USE_CUDA
-template <class MatrixType>
-__global__ void JacobiIterGPU(const MatrixType A, const ArrayBaseD b, const ArrayBaseD x, ArrayBaseD out_x, const double damping)
-{
-  int r = blockIdx.x*blockDim.x + threadIdx.x;
-  if (r < b.size())
-	JacobiIterKernel(A, b, x, out_x, damping, r);
-}
-
-template <class MatrixType> __global__ void ResiduumGPU(const MatrixType A, const ArrayBaseD b, const ArrayBaseD x, double * result)
-{
-	__shared__ float sdata[blockSize];
-	const unsigned int tid = threadIdx.x;
-	sdata[tid] = 0;
-	int r = blockIdx.x*blockDim.x + threadIdx.x;
-	if (r >= A.num_rows()) return;
-	sdata[tid] = ResiduumKernel(A, b, x, r);
-	__syncthreads();
-	for( unsigned int s = blockDim.x/2 ; s > 0 ; s >>= 1 )
-	{
-		if( tid < s ) sdata[tid] += sdata[tid + s];
-		__syncthreads();
-	}
-	if( tid == 0 ) result[blockIdx.x] = sdata[0];
-	//if( tid == 0 ) atomicAdd(result, sdata[0]); //Doesn't work
-}
-#endif
-
-template <class MatrixType> void JacobiIter(const MatrixType &A, const ArrayD &b, const ArrayD & x, ArrayD & out_x, const double damping=1)
-{
-	const int n = A.num_rows();
-	assert(A.num_cols() == n) ;
-	assert(b.size() == n);
-	assert(x.size() >= n);
-	assert(out_x.size() >= n);
-	assert(b._onGPU == x._onGPU && x._onGPU == out_x._onGPU);
-#if USE_CUDA
-	if (x._onGPU)
-	{
-		JacobiIterGPU <<<  gridSize(n), blockSize >>> (A.toKernel(), b, x, out_x, damping);
-	}
-	else
-#endif
-	{
-#pragma omp parallel for schedule(static)
-		for (int r = 0; r < n; r++)
-			JacobiIterKernel(A, b, x, out_x, damping, r);
-	}
-}
-
-
-template <class MatrixType> __cuda_call__ double ResiduumKernel(const MatrixType &A, const ArrayBaseD & b, const ArrayBaseD & x, int r)
-{
-	double res = 0;
-	for (int i = 0; i < A.num_in_row(r); i++)
-	{
-		double aVal = 0; int c = r;
-		A.get_el_in_row(r,i, aVal, c);
-		res += aVal*x[c];
-	}
-	res = b[r] - res;
-	return res*res;
-}
-
-template <class MatrixType> double Residuum(const MatrixType &A, const ArrayD & b, const ArrayD & x)
-{
-	const int n = A.num_rows();
-	assert(A.num_cols() == n) ;
-	assert(b.size() == n);
-	assert(x.size() >= n);
-	assert(b._onGPU == x._onGPU);
-	double res = 0;
-#if USE_CUDA
-	if (x._onGPU)
-	{
-	#if 0
-		static double *resGPU = 0;
-		if (resGPU==0) cudaMalloc(&resGPU, sizeof(double));
-		cudaMemset(resGPU, 0, sizeof(double));
-		ResiduumGPU<MatrixType> <<<  gridSize(n), blockSize >>> (A, b, x, resGPU);
-		cudaDeviceSynchronize();
-		cudaMemcpy(&res, resGPU, sizeof(double), cudaMemcpyDeviceToHost);
-	#else
-		ArrayD resids(gridSize(n), true);
-		resids.fill(0);
-		ResiduumGPU <<<  gridSize(n), blockSize >>> (A.toKernel(), b, x, resids.data);
-		resids.moveToCPU();
-		for (int i = 0; i < resids.size(); i++) res+=resids[i];
-	#endif
-	}
-	else
-#endif //USE_CUDA
-	{
-#pragma omp parallel for reduction(+:res) schedule(static)
-		for (int r = 0; r < n; r++)
-			res+=ResiduumKernel(A, b, x, r);
-	}
-	return sqrt(res);
-}
-#endif
diff --git a/src/TNL/legacy/incompressible-navier-stokes/base_structs.h b/src/TNL/legacy/incompressible-navier-stokes/base_structs.h
deleted file mode 100644
index a60b4b2a666561273c518a6ac3c5c989907e666e..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/base_structs.h
+++ /dev/null
@@ -1,470 +0,0 @@
-#pragma once
-#include <math.h>
-#include <string.h>
-
-#if USE_CUDA
-	#include "base_cuda.h"
-#else
-	void CudaCheckError() {}
-	#define __cuda_call__
-#endif
-
-typedef unsigned int uint;
-template <class T> T abs(const T &x){return (x<0)? -x:x;}
-template <class T> T max(T a, T b){return (a>=b)? a:b;}
-template <class T> T max(T a, T b, T c){return max(a, max(a,b));}
-template <class T> T min(T a, T b){return (a<=b)? a:b;}
-template <class T> inline T square(T x){return x*x;}
-template<class T> inline double clamp(T x){ return x<0 ? 0 : x>1 ? 1 : x; }
-template <class T> inline void clamp(T & val, T min, T max){if (val<min) val=min; if (val>max) val=max;}
-
-struct vec3
-{
-  double x, y, z;
-  vec3(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
-  vec3& operator+=(const vec3 &b) { x += b.x; y += b.y; z += b.z; return *this; }
-  vec3& operator-=(const vec3 &b) { x -= b.x; y -= b.y; z -= b.z; return *this; }
-  vec3 operator+(const vec3 &b) const { return vec3(x+b.x,y+b.y,z+b.z); }
-  vec3 operator-(const vec3 &b) const { return vec3(x-b.x,y-b.y,z-b.z); }
-  vec3 operator*(double b) const { return vec3(x*b,y*b,z*b); }
-  vec3 mult(const vec3 &b) const { return vec3(x*b.x,y*b.y,z*b.z); }
-  double length() const{return sqrt(x*x+y*y+z*z);}
-  vec3& normalize(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
-  vec3 norm() const { vec3 res(*this); res.normalize(); return res; }
-  vec3& clamp(){ ::clamp<double>(x); ::clamp<double>(y); ::clamp<double>(z); return *this; }
-  double dot(const vec3 &b) const { return x*b.x+y*b.y+z*b.z; }
-  // cross:
-  vec3 operator%(const vec3 & b) const {return vec3(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
-  static double Dot(const vec3 & a, const vec3 & b){ return a.dot(b); }
-  static vec3 Cross(const vec3 & a, const vec3 & b){return a%b;}
-  static vec3 Mult (const vec3 & a, const vec3 & b){return a.mult(b);}
-};
-
-struct vec2i
-{
-	int x,y;
-	__cuda_call__ vec2i(){x=y=0;}
-	__cuda_call__ vec2i(int x, int y){this->x=x; this->y=y;}
-};
-
-template <class T> struct ArrayBase
-{
-public:
-
-	int w,h,d; //vector dimensions
-	T * data;
-
-	__cuda_call__ ArrayBase (){w=h=d=0; data=0;}
-	__cuda_call__ ArrayBase (T *data, int w, int h, int d){this->data=data; this->w=w; this->h=h; this->d=d;}
-	__cuda_call__ T & operator [] (int i){return data[i];}
-	__cuda_call__ const T & operator [] (int i) const {return data[i];}
-	__cuda_call__ T & operator() (int x, int y) { return data[y*w+x]; }
-	__cuda_call__ const T & operator() (int x, int y) const { return data[y*w+x]; }
-	__cuda_call__ int size () const {return w*h*d;}
-
-	__cuda_call__ int width() const {return w;}
-	__cuda_call__ int height() const {return h;}
-	__cuda_call__ int depth() const {return d;}
-
-	__cuda_call__ int index(int x, int y) const {return y*w+x;}
-	__cuda_call__ vec2i index2D(int i) const { int y = i/w; return vec2i(i-y*w,y);}
-	operator T* (){return data;}
-
-	void set(const ArrayBase<T>& arr) {set(arr.data, arr.w, arr.h, arr.d);}
-	void set(T *data, int w, int h, int d){this->data=data; this->w=w; this->h=h; this->d=d;}
-};
-typedef ArrayBase<double> ArrayBaseD;
-
-template<typename T>
-class Array : public ArrayBase<T>
-{
-public:
-
-	Array<T>* _bindedFrom; //If case this is only shared array, this points to the parent data array
-	bool _onGPU;
-
-	Array(){ _bindedFrom = 0; _onGPU = false; }
-	Array(int size, bool onGPU = false){ this->_onGPU = onGPU; _bindedFrom = 0; resize1d(size); }
-	Array(const Array<T> & arr){throw "Not supported";}
-	ArrayBase<T> toArr() const {return ArrayBase<T>(this->data,this->w,this->h,this->d);}
-	bool onGPU()const { return _bindedFrom ? _bindedFrom->onGPU() : _onGPU; }
-
-	void copy(const Array<T> & arr)
-	{
-		if (this->size() < arr.size()) throw "Array isn't big enough";
-#if USE_CUDA
-		cudaMemcpyKind copyKind = arr._onGPU? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost;
-		if (_onGPU) copyKind = arr._onGPU? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
-		cudaMemcpy(this->data, arr.data, arr.size()* sizeof(T), copyKind);
-#else
-		memcpy(this->data, arr.data,  arr.size()* sizeof(T));
-#endif
-	}
-	void fill (T val)
-	{
-#if USE_CUDA
-		if (_onGPU)
-			FillGPU <<<  gridSize(this->size()), blockSize >>> (this->data, val, this->size());
-		else
-#else
-			for (int i = this->size()-1; i >= 0; i--) this->data[i] = val;
-#endif
-	}
-	void bind1d(Array<T> & arr, int offset, int size){ unbindOrFree(); this->set(&arr[offset], size, 1, 1); _bindedFrom = &arr; }
-	void unbindOrFree(){ if (!_bindedFrom) free(); this->data = 0; this->w = this->h = this->d = 0; _bindedFrom = 0; }
-
-
-	static T* Alloc(int size, bool onGPU)
-	{
-		T * res = 0;
-#if USE_CUDA
-		if (onGPU)
-			cudaMalloc(&res, size*sizeof(T));
-		else
-#else
-			res = (size > 0) ? new T[size] : 0;
-#endif
-		return res;
-	}
-	void free()
-	{
-		if (_bindedFrom) throw "Cant free not own data";
-		if (this->data){
-#if USE_CUDA
-			if (this->_onGPU)
-				cudaFree(this->data);
-			else
-#else
-				delete[] this->data;
-#endif
-		}
-		this->data=0; this->w=this->h=this->d=0; _bindedFrom=0;
-	}
-	void resize(int newSize, bool leaveMore = false)
-	{
-		if ( (!leaveMore && this->size() != newSize) || this->size() < newSize)
-		{
-			free();
-			this->data = Alloc(newSize, this->_onGPU);
-			_bindedFrom = 0;
-		}
-		this->w=newSize; this->h=this->d=1;
-	}
-	void resize1d(int size){resize(size); }
-	void clone(const Array<T> & arr, bool leaveMore, bool moveToGPU)
-	{
-		if ( (!leaveMore && arr.size()!=this->size()) || arr.size() > this->size() )
-		{
-			free();
-			this->_onGPU = moveToGPU;
-			resize(arr.size());
-		}
-		else move(moveToGPU);
-		this->w=arr.w; this->h=arr.h; this->d=arr.d;
-		this->copy(arr);
-	}
-	void clone(const Array<T> & arr, bool leaveMore = false){clone(arr, leaveMore, arr._onGPU);}
-	~Array(){ if (!_bindedFrom) free(); }
-
-	void move(bool toGPU)
-	{
-#if USE_CUDA
-		if (this->_onGPU == toGPU) return;
-		T* newData = Alloc(this->size(), toGPU);
-		cudaMemcpy(newData, this->data, this->size()* sizeof(T), toGPU? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost);
-		if (this->data){
-			if (this->_onGPU) cudaFree(this->data);
-			else delete[] this->data;
-		}
-		this->data = newData;
-		this->_onGPU = toGPU;
-#else
-		assert(false); //CPU only version, what do you want to move?
-#endif
-	}
-	void moveToGPU(){move(true);}
-	void moveToCPU(){move(false);}
-
-	void print(const char * name = 0)
-	{
-		Array<T> aux;
-		ArrayBase<T> vec = *this;
-		if (_onGPU) { aux.clone(*this, false, false); vec = aux; }
-		printf("Printing vector %s\n", (name) ? name : "Noname");
-		for (int i = 0; i < this->size(); i++)
-			printf("%d:%.10f, ", i, (double)vec.data[i]);
-		printf("\n\n");
-		fflush(stdout);
-	}
-
-	/*T norm()
-	{
-		T res = 0;
-		#pragma omp parallel for reduction(+:res) schedule(static)
-		for (int i = 0; i < _size; i++) res+=square(_data[i]);
-		return sqrt(res);
-	}
-	
-
-	void add(const Array<T> & vec, T mult)
-	{
-		Add(*this, vec, 1.0, mult, *this);
-	}
-
-	static void Add(const Array<T> & a, const Array<T> & b, const T aMult, const T bMult, Array<T> & res)
-	{
-		if (a.size()!=b.size() || b.size() > res.size()) throw "Array::Add - array sizes differ";
-		#pragma omp parallel for schedule(static)
-		for (int i = 0; i < res.size(); i++)
-			res[i]=aMult*a[i]+bMult*b[i];
-	}
-	void Subtract(const Array<T> & a, const Array<T> & b, Array<T> & res)
-	{
-		if (a.size()!=b.size() || b.size() > res.size()) throw "Array::Subtract - array sizes differ";
-		#pragma omp parallel for schedule(static)
-		for (int i = 0; i < b.size(); i++)
-			res[i]=a[i]-b[i];
-	}*/
-};
-
-typedef Array<double> ArrayD;
-typedef Array<int> ArrayI;
-
-template <class T>
-class Array2D: public Array<T>
-{
-public:
-	void resize2d(int width, int height){if (width==this->w && height==this->h) return; this->resize(width*height);
-		this->w=width; this->h=height; this->d=1;}
-	void bind2d(Array<T> & arr, int offset, int width, int height){
-		this->unbindOrFree();
-		this->set(&arr.data[offset], width, height, 1); 
-		this->_bindedFrom=&arr;}
-	void bind3d(Array<T> & arr, int offset, int width, int height, int depth){this->unbindOrFree(); 
-		this->set(&arr[offset], width, height, depth); this->_bidnFrom=&arr;}
-	void clone(const Array2D<T>& arr)
-	{
-		Array<T>::clone(arr);
-		this->w = arr.width(); this->h=arr.height();
-	}
-	void fillBorders(T val){
-		if (this->_onGPU) throw "FillBorders isn't implemented on GPU yet.";
-		for (int x=0; x < this->w;   x++) this->data[x] = this->data[(this->h-1)*this->w + x] = val;
-		for (int y=1; y < this->h-1; y++) this->data[y*this->w] = this->data[y*this->w + this->w-1] = val;
-	}
-};
-typedef Array2D<double> arr2D;
-
-
-struct GPUMatrix
-{
-	int _num_rows, _num_cols;
-	ArrayBaseD _vals;
-	ArrayBase<int> _cols, _rowStarts;
-
-	__cuda_call__ int num_rows() const { return _num_rows; }
-	__cuda_call__ int num_cols() const { return _num_cols; }
-	__cuda_call__ int num_in_row(int row) const { return _rowStarts[row + 1] - _rowStarts[row]; }
-	__cuda_call__ inline void get_el_in_row(int row, int ind_in_row, double & out_val, int & out_col) const {
-		int i = _rowStarts[row] + ind_in_row; out_val = _vals[i]; out_col = _cols[i];
-	}
-	inline double& get_val_in_row(int row, int ind_in_row){ return _vals[_rowStarts[row] + ind_in_row]; }
-	__cuda_call__ inline double  get_val_in_row(int row, int ind_in_row) const { return _vals[_rowStarts[row] + ind_in_row]; }
-	__cuda_call__ inline int get_col_index(int row, int ind_in_row) const { return _cols[_rowStarts[row] + ind_in_row]; }
-	__cuda_call__ double get_diag(int row) const
-	{
-		for (int i = _rowStarts[row]; i < _rowStarts[row + 1]; i++)
-			if (_cols[i] == row) return _vals[i];
-		return -1e100;
-		//throw "Diagonal element not found";
-	}
-};
-
-class MatrixCSR
-{
-public:
-	ArrayD _vals;
-	ArrayI _cols;
-	ArrayI _rowStarts;
-	int _num_rows, _num_cols;
-
-	MatrixCSR(){_num_cols=_num_rows=0;}
-	MatrixCSR(const MatrixCSR & mat){throw "Copy constructor for MatrixCSR doesn't exist";}
-	__cuda_call__ int num_rows() const { return _num_rows; }
-	__cuda_call__ int num_cols() const { return _num_cols; }
-	__cuda_call__ int num_in_row(int row) const { return _rowStarts[row + 1] - _rowStarts[row]; }
-	GPUMatrix toKernel() const
-	{
-		assert(_vals.onGPU());
-		GPUMatrix res;
-		res._num_rows = _num_rows; res._num_cols = _num_cols;
-		res._rowStarts = _rowStarts;
-		res._cols = _cols; res._vals = _vals;
-		return res;
-	}
-	void resize(int num_rows, int num_cols, int num_values)
-	{
-		if (_vals.size()!=num_values)
-		{
-			_vals.resize(num_values);
-			_cols.resize(num_values);
-		}
-		if (_rowStarts.size()!=num_rows+1) _rowStarts.resize(num_rows+1);
-		_rowStarts.fill(0);
-		_num_rows=num_rows;
-		_num_cols=num_cols;
-	}
-	void clear()
-	{
-		_vals.fill(0);
-		_cols.fill(0);
-		_rowStarts.fill(0);
-	}
-
-	template <class MatrixType>
-	void copyVals(const MatrixType & matToClone)
-	{
-		for (int ri = 0; ri < _num_rows; ri++)
-		{
-			int nr = num_in_row(ri), rs = _rowStarts[ri];
-			for (int i = 0; i < nr; i++)
-				_vals[rs + i] = matToClone.get_val_in_row(ri, i);
-		}
-	}
-
-	template <class MatrixType>
-	void clone(const MatrixType & matToClone)
-	{
-		_num_rows = matToClone.num_rows();
-		_num_cols = matToClone.num_cols();
-		_rowStarts.resize(_num_rows+1);
-		_rowStarts[0] = 0;
-		for (int ri = 0; ri < _num_rows; ri++)
-			_rowStarts[ri+1] = _rowStarts[ri] + matToClone.num_in_row(ri);
-
-		int nne = _rowStarts[_num_rows];
-		_vals.resize(nne);
-		_cols.resize(nne);
-		for (int ri = 0; ri < _num_rows; ri++)
-		{
-			int nr = matToClone.num_in_row(ri), rs = _rowStarts[ri];
-			for (int i = 0; i < nr; i++)
-				matToClone.get_el_in_row(ri,i,_vals[rs+i],_cols[rs+i]);
-		}
-	}
-
-	inline double& operator()(int ri, int ci)
-	{
-		if(ri>=_num_rows || ci>=_num_cols) throw "MatrixCSR - Index out of bounds";
-		for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++ )
-			if (_cols[i] == ci) return _vals[i];
-		throw "MatrixCSR - Value not found";
-	}
-	inline const double& operator()(int ri, int ci) const
-	{
-		if(ri>=_num_rows || ci>=_num_cols) throw "MatrixCSR - Index out of bounds";
-		for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++ )
-			if (_cols[i] == ci) return _vals[i];
-		throw "MatrixCSR - Value not found";
-	}
-	inline bool isNull(int ri, int ci) const
-	{
-		if(ri>=_num_rows || ci>=_num_cols) throw "MatrixCSR - Index out of bounds";
-		for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++ )
-			if (_cols[i] == ci) return false;
-		return true;
-	}
-	__cuda_call__ inline void get_el_in_row(int row, int ind_in_row, double & out_val, int & out_col) const {
-		int i = _rowStarts[row] + ind_in_row; out_val = _vals[i]; out_col=_cols[i];
-	}
-	inline double& get_val_in_row(int row, int ind_in_row){return _vals[_rowStarts[row] + ind_in_row];}
-	__cuda_call__ inline double  get_val_in_row(int row, int ind_in_row) const { return _vals[_rowStarts[row] + ind_in_row]; }
-	__cuda_call__ inline int get_col_index(int row, int ind_in_row) const { return _cols[_rowStarts[row] + ind_in_row]; }
-	__cuda_call__ double get_diag(int row) const
-	{ 
-		for (int i = _rowStarts[row]; i < _rowStarts[row + 1]; i++)
-			if (_cols[i] == row) return _vals[i];
-		return -1e100;
-		//throw "Diagonal element not found";
-	}
-	void loadMMMatrix(const char * filename);
-	void mult(const double * vec, double *out) const
-	{
-		#pragma omp parallel for schedule(static)
-		for (int ri = 0; ri < _num_rows; ri++)
-		{
-			out[ri] = 0;
-			for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++)
-				out[ri] += _vals[i] * vec[_cols[i]];
-		}
-	}
-	void mult(const ArrayD & vec, ArrayD & out) const
-	{
-		if (out.size()< _num_rows) throw "CSRMatrix::mult - out vector not big enough";
-		if (vec.size()< _num_cols) throw "CSRMatrix::mult - vec vector not big enough";
-		#pragma omp parallel for schedule(static)
-		for (int ri = 0; ri < _num_rows; ri++)
-		{
-			out[ri] = 0;
-			for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++)
-				out[ri] += _vals[i] * vec[_cols[i]];
-		}
-	}
-
-	void print() const
-	{
-		printf("Matrix %d x %d with %d values\n", _num_rows, _num_cols, _vals.size()); fflush(stdout);
-		for (int ri = 0; ri < _num_rows; ri++)
-		{
-			printf("Row %d - ", ri);
-			for (int i = _rowStarts[ri]; i < _rowStarts[ri+1]; i++)
-				printf("%d:%f, ",_cols[i], _vals[i]);
-			printf("\n");
-		}
-		fflush(stdout);
-	}
-
-	void moveToGPU(){ _rowStarts.moveToGPU(); _cols.moveToGPU(); _vals.moveToGPU(); }
-	void moveToCPU(){ _rowStarts.moveToCPU(); _cols.moveToCPU(); _vals.moveToCPU(); }
-
-	/*static void MatrixMult(const MatrixCSR & A, const MatrixCSR & B, MatrixCSR & res)
-	{
-		if (A._num_cols != B._num_rows) throw "CSRMatrix::MatrixMult - A.cols and b.rows don't agree";
-
-		intArr aux(B.num_cols()); aux.fill(-1);
-		res._rowStarts.resize(A.num_rows());
-		for (int r = 0; r < A.num_rows(); r++)
-		{
-			int actCols=0;
-			for (int i = 0; i < A.num_in_row(r); i++)
-			{
-				int ri = A.get_col_index(r,i);
-				for (int j = 0; j < B.num_in_row(ri); j++)
-				{
-					int ci = B.get_col_index(ri,j);
-					if (aux[ci]!=r)
-					{
-						actCols++;
-						aux[ci] = r;
-					}
-				}
-			}
-			res._rowStarts[r+1] = res._rowStarts[r+1]+actCols;
-		}
-
-	#pragma omp parallel for schedule(static)
-		for (int r = 0; r < A.num_rows(); r++)
-		{
-			for (int i = 0; i < A.num_in_row(r); i++)
-			{
-				int ri = A.get_col_index(r,i);
-				for (int j = 0; j < B.num_in_row(ri); j++)
-				{
-					int ci = B.get_col_index(ri,j);
-					double val = res(r,ci);
-					res(r,ci) = val + A.get_val_in_row(r,i)*B.get_val_in_row(ri,j);
-				}
-			}
-		}
-	}*/
-};
diff --git a/src/TNL/legacy/incompressible-navier-stokes/solver.h b/src/TNL/legacy/incompressible-navier-stokes/solver.h
deleted file mode 100644
index b4d0403d4c1eae1175efddad2f4bbe16c54c93d1..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/solver.h
+++ /dev/null
@@ -1,523 +0,0 @@
-#include <chrono>
-
-#if 1
- #include "base.h"
-#else
- #include "lin_alg.h"
-#endif
-#include <assert.h>
-
-/*Vytvoreno dle:
-http://www.leb.eei.uni-erlangen.de/winterakademie/2008/report/content/course01/pdf/0105.pdf
-http://math.mit.edu/~gs/cse/codes/mit18086_navierstokes.pdf
-
-*/
-
-__cuda_call__ inline double HorAvg (const ArrayBaseD & arr, int x, int y) { return 0.5*(arr(x,y) + arr(x+1,y)); }
-__cuda_call__ inline double VerAvg (const ArrayBaseD & arr, int x, int y) { return 0.5*(arr(x,y) + arr(x,y+1)); }
-__cuda_call__ inline double HorDiff(const ArrayBaseD & arr, int x, int y) { return arr(x+1,y) - arr(x,y); }
-__cuda_call__ inline double VerDiff(const ArrayBaseD & arr, int x, int y) { return arr(x,y+1) - arr(x,y); }
-__cuda_call__ inline bool IsBoundary(int x, int y, int w, int h){return x==0||y==0||x==w-1||y==h-1;}
-__cuda_call__ inline bool IsBoundary(const ArrayBaseD & arr, int x, int y){ return x==0 || y==0 || x==arr.width()-1 || y==arr.height()-1;}
-__cuda_call__ inline bool IsOut(const ArrayBaseD & arr, int x, int y){ return x < 0 || y < 0 || x >= arr.width() || y >= arr.height(); }
-
-class RegularMesh
-{
-	int N;
-	bool centerIsBoundary(int x, int y){ return x == 0 || y == 0 || x == N - 1 || y == N - 1; }
-	bool verFaceIsBoundary(int x, int y){ return x == 0 || y == 0 || x == N || y == N - 1; }
-	bool horFaceIsBoundary(int x, int y){ return x == 0 || y == 0 || x == N -1 || y == N; }
-	int center(int x, int y){return y*N + x;}
-	int verFace(int x, int y){ return y*(N + 1) + x; }
-	int horFace(int x, int y){ return y*N + x; }
-	int numCenters() { return N*N; }
-	int numVerFaces() { return (N + 1)*N; }
-	int numHorFaces() { return N*(N + 1); }
-
-	vec2i verFaceLeftToCenter(int x, int y){ assert(x >= 0);  return vec2i(x, y); }
-	vec2i verFaceRightToCenter(int x, int y){ assert(x <  N);  return vec2i(x + 1, y); }
-	vec2i horFaceUpToCenter(int x, int y){ assert(y >= 0);  return vec2i(x, y); }
-	vec2i horFaceDownToCenter(int x, int y){ assert(y <  N);  return vec2i(x, y+1); }
-};
-
-class EmptyMatrix
-{
-	__cuda_call__ int num_rows() const {return 0;}
-	__cuda_call__ int num_cols() const {return 0;}
-	__cuda_call__ int num_in_row(int row) const {return 0;}
-	__cuda_call__ void get_el_in_row(int row, int ind_in_row, double & val, int &col) const {val=0; col=-1;}
-	__cuda_call__ double get_diag(int row) const {return 0;}
-	EmptyMatrix& toKernel() { return *this; }
-	const EmptyMatrix& toKernel() const { return *this; }
-};
-
-class IdentityMatrix
-{
-	int _size;
-public:
-	IdentityMatrix() :_size(0){}
-	IdentityMatrix(int size) : _size(size){}
-	__cuda_call__ int num_rows() const { return _size; }
-	__cuda_call__ int num_cols() const { return _size; }
-	__cuda_call__ int num_in_row(int row) const { return 1; }
-	__cuda_call__ void get_el_in_row(int row, int ind_in_row, double & val, int &col) const { val = 1; col = row; }
-	__cuda_call__ double get_diag(int row) const { return 1; }
-	IdentityMatrix& toKernel() { return *this; }
-	const IdentityMatrix& toKernel() const { return *this; }
-};
-
-class SimpleMatrix2D
-{
-	ArrayBaseD _var;
-	double _diag;
-	double _off;
-public:
-	SimpleMatrix2D(){_diag=1;_off=0;}
-	SimpleMatrix2D(Array2D<double>& var, double diag, double off){set(var,diag,off);}
-	void set(Array2D<double>& var, double diag, double off){_var.set(var); _diag=diag; _off=off;}
-	SimpleMatrix2D& toKernel() { return *this; }
-	const SimpleMatrix2D& toKernel() const { return *this; }
-	__cuda_call__ int num_rows() const {return _var.size();}
-	__cuda_call__ int num_cols() const {return _var.size();}
-	__cuda_call__ int num_in_row(int row) const {
-		vec2i coord = _var.index2D(row);
-		return IsBoundary(_var, coord.x, coord.y)? 1 : 5;
-	}
-	__cuda_call__ void get_el_in_row(int row, int ind_in_row, double & val, int &col) const {
-		vec2i coord = _var.index2D(row);
-		//if (IsBoundary(_var, coord.x, coord.y)){ col = row; val = 1; return; }
-		int x=coord.x, y=coord.y, w = _var.width();
-		{
-			if (x==0) {col = row+1; val = 1; return;}
-			else if (y==0) {col = row+w; val = 1; return;}
-			else if (x==_var.width()-1 ) {col = row-1; val = 1; return;}
-			else if (y==_var.height()-1) {col = row-w; val = 1; return;}
-		}
-		switch(ind_in_row)
-		{
-		case 0: val = _diag; col = row; break;
-		case 1: val = _off;  col = row-1; break;
-		case 2: val = _off;  col = row+1; break;
-		case 3: val = _off;  col = row-_var.width(); break;
-		case 4: val = _off;  col = row+_var.width(); break;
-		}
-	}
-};
-
-class AdvectDiffusionMatrix2D
-{
-public:
-	ArrayBaseD u, v;
-	double visc, dt;
-
-	AdvectDiffusionMatrix2D(){ visc = dt = 0; }
-	AdvectDiffusionMatrix2D(Array2D<double> &u, Array2D<double> &v, double visc, double dt){set(u,v,visc,dt);}
-	void set(Array2D<double> &u, Array2D<double> &v, double visc, double dt)
-	{
-		this->u.set(u); this->v.set(v); this->visc=visc; this->dt = dt;
-	}
-	AdvectDiffusionMatrix2D& toKernel() { return *this; }
-	const AdvectDiffusionMatrix2D& toKernel() const { return *this; }
-
-	__cuda_call__ int num_rows() const {return u.size()+v.size();}
-	__cuda_call__ int num_cols() const {return u.size()+v.size();}
-	__cuda_call__ int num_in_row(int row) const {
-		const ArrayBaseD *act = row>=u.size()? &v : &u;
-		vec2i coord = act->index2D(row - (row>=u.size()? u.size() : 0));
-		return IsBoundary(*act, coord.x, coord.y)? 1 : 5;
-	}
-	__cuda_call__ void get_el_in_row(int row, int ind_in_row, double & val, int &col) const
-	{
-		const ArrayBaseD *act = row>=u.size()? &v : &u;
-		vec2i coord = act->index2D(row - (row>=u.size()? u.size() : 0));
-		int x=coord.x, y=coord.y, w = act->width();
-		if (IsBoundary(*act,x,y)) {col = row; val = 1; return;}
-
-		const double dx = 1.0/u.height(), dy=dx, vix = dt*visc/(dx*dx), viy=dt*visc/(dy*dy);
-		double cxm=0,cym=0,cxp=0,cyp=0;
-		if (act==&u)
-		{
-			cxm = -0.25*HorAvg(u,x-1,y)/dx; cxp = 0.25*HorAvg(u,x,y)/dx;
-			cym = -0.25*HorAvg(v,x-1,y)/dy; cyp = 0.25*HorAvg(v,x-1,y+1)/dy;
-		}
-		else
-		{
-			cxm = -0.25*VerAvg(u,x,y-1)/dx; cxp = 0.25*VerAvg(u,x+1,y-1)/dx;
-			cym = -0.25*VerAvg(v,x,y-1)/dy; cyp = 0.25*VerAvg(v,x,y)/dy;
-		}
-		switch(ind_in_row)
-		{
-		case 0: val = 1+dt*(cxm+cxp+cym+cyp)+2*vix+2*viy; col = row; break; //Diagonal element
-		case 1: val = dt*cxm-vix; col = row-1; break;
-		case 2: val = dt*cxp-vix; col = row+1; break;
-		case 3: val = dt*cym-viy; col = row-w; break;
-		case 4: val = dt*cyp-viy; col = row+w; break;
-		case 10: val = 1+2*dt*(cxm+cxp+cym+cyp); col =row; break; //special number for sum of whole row
-		}
-	}
-	__cuda_call__ double get_val_in_row(int row, int ind_in_row) const{
-		double val; int col;
-		get_el_in_row(row, ind_in_row, val, col);
-		return val;
-	}
-	__cuda_call__ double get_diag(int row) const
-	{
-		double val; int col;
-		get_el_in_row(row, 0, val, col);
-		return val;
-	}
-};
-
-class AdvectModifPoisson
-{
-	const ArrayBaseD *u,*v, *p;
-	const AdvectDiffusionMatrix2D * adMat;
-public:
-	AdvectModifPoisson(Array2D<double> *p, const AdvectDiffusionMatrix2D * adMat){
-		this->p = p;
-		this->adMat = adMat;
-		this->u = &(adMat->u);
-		this->v = &(adMat->v);
-	}
-	__cuda_call__ int num_rows() const {return p->width()*p->height();}
-	__cuda_call__ int num_cols() const {return p->width()*p->height();}
-	__cuda_call__ int num_in_row(int row) const {
-		vec2i coord = p->index2D(row - (row>=u->size()? u->size() : 0));
-		return IsBoundary(*p, coord.x, coord.y)? 1 : 5;
-	}
-	__cuda_call__ void get_el_in_row(int row, int ind_in_row, double & val, int &col) const
-	{
-
-		vec2i coord = p->index2D(row - (row>=u->size()? u->size() : 0));
-		int x=coord.x, y=coord.y, w = p->width();
-		{
-			if (x==0) {col = row+1; val = 1; return;}
-			else if (y==0) {col = row+w; val = 1; return;}
-			else if (x==w-1) {col = row-1; val = 1; return;}
-			else if (y==p->height()-1) {col = row-w; val = 1; return;}
-		}
-
-		const int elemInd = 0;
-		switch(ind_in_row)
-		{
-		case 0:
-			val = adMat->get_val_in_row( u->index(x-1,y-1), elemInd) + adMat->get_val_in_row( u->index(x,  y-1), elemInd) +
-				  adMat->get_val_in_row( v->index(x-1,y-1), elemInd) + adMat->get_val_in_row( v->index(x-1,y  ), elemInd);
-			col = row;
-			break;
-		case 1: val = -adMat->get_val_in_row( u->index(x-1,y-1), elemInd); col = row-1; break;
-		case 2: val = -adMat->get_val_in_row( u->index(x,  y-1), elemInd); col = row+1; break;
-		case 3: val = -adMat->get_val_in_row( v->index(x-1,y-1), elemInd); col = row-w; break;
-		case 4: val = -adMat->get_val_in_row( v->index(x-1,y  ), elemInd); col = row+w; break;
-		}
-	}
-};
-
-#if USE_CUDA
-__global__ void GPU_set_zero_neumann(ArrayBaseD a)
-{
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  int ex = a.w-1, ey=a.h-1;
-  if (i < a.w-1) {a(i,0) = a(i,1); a(i,ey) = a(i,ey-1);}
-  if (i < a.h-1) {a(0,i) = a(1,i); a(ex,i) = a(ex-1,i);}
-  if (i==0)
-  {
-	  a(0,0)=a(1,1);
-	  a(ex,0)=a(ex-1,1);
-	  a(0,ey)=a(1,ey-1);
-	  a(ex,ey)=a(ex-1,ey-1);
-  }
-}
-
-__global__ void GPU_set_bnd(ArrayBaseD a, int type) //type is same as enum vars 0=var_u, 1=var_v ...
-{
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  int ex = a.w-1, ey=a.h-1;
-  double top = type==0? 0.05 : 0;
-  if (i < a.w-1) {a(i,0) = 0; a(i,ey) = top;}
-  if (i < a.h-1) {a(0,i) = 0; a(ex,i) = 0;}
-  if (i==0)
-  {
-	  a(0,0) = a(ex,0) = a(0,ey) = a(ex,ey) = 0;
-  }
-}
-
-template <class MatrixType>
-__global__ void GPU_pressure_correction(const int dir, double mult, const ArrayBaseD var, MatrixType mat, int indOff, ArrayBaseD res) //dir is 0 for X, or 1 for Y
-{
-	int x = blockIdx.x*blockDim.x + threadIdx.x, y = blockIdx.y*blockDim.y + threadIdx.y;
-	if (IsBoundary(var, x, y)) return;
-
-	double aux = 0;
-	if (dir == 0)
-		aux += mult*(var(x+1,y+1) - var(x,y+1));
-	else
-		aux += mult*(var(x+1,y+1) - var(x+1,y));
-	int ind = res.index(x,y);
-	if (mat.num_rows() > 0) aux /= mat.get_diag(ind+indOff);
-	res[ind] += aux;
-}
-template <class MatrixType>
-void Pressure_correction_GPU(double mult, Array2D<double> &u, Array2D<double> &v, Array2D<double> &p,  MatrixType mat, Array<double> & res)
-{
-	ArrayBaseD pu, pv;
-	pu.set(res.data, u.w, u.h, u.d);
-	int uOff = u.size();
-	pv.set(&(res.data[uOff]), u.w, u.h, u.d);
-	GPU_pressure_correction<MatrixType> <<< gridSize2D(u.w, u.h), blockSize2D >>> (0, mult, pu, mat, 0, res);
-	GPU_pressure_correction<MatrixType> <<< gridSize2D(v.w, v.h), blockSize2D >>> (0, mult, pv, mat, uOff, res);
-}
-
-__global__ void GPU_calc_divergence(const ArrayBaseD u, const ArrayBaseD v, ArrayBaseD res, int N)
-{
-	int x = blockIdx.x*blockDim.x + threadIdx.x, y = blockIdx.y*blockDim.y + threadIdx.y;
-	if (IsOut(res, x, y)) return;
-	if (IsBoundary(res, x, y)) return;
-	res(x, y) = -0.5f*(u(x, y - 1) - u(x - 1, y - 1) + v(x - 1, y) - v(x - 1, y - 1)) / N; // -(u_x + v_y)
-}
-
-template <class MatrixType>
-__global__ void GPU_pressure_correction_u_part(const MatrixType mat, const ArrayBaseD u, const ArrayBaseD p, ArrayBaseD res, int N, int sign)
-{
-	int x = blockIdx.x*blockDim.x + threadIdx.x, y = blockIdx.y*blockDim.y + threadIdx.y;
-	if (IsOut(u, x, y)) return;
-	if (IsBoundary(u, x, y)) return;
-
-	int ind = u.index(x, y);
-	res[ind] += sign*0.5f*N*(p(x + 1, y + 1) - p(x, y + 1)) / mat.get_diag(ind);
-}
-template <class MatrixType>
-__global__ void GPU_pressure_correction_v_part(const MatrixType mat, const ArrayBaseD v, const ArrayBaseD p, ArrayBaseD res, int N, int sign, int vOff)
-{
-	int x = blockIdx.x*blockDim.x + threadIdx.x, y = blockIdx.y*blockDim.y + threadIdx.y;
-	if (IsOut(v, x, y)) return;
-	if (IsBoundary(v, x, y)) return;
-
-	int ind = v.index(x, y) + vOff;
-	res[ind] += sign*0.5f*N*(p(x + 1, y + 1) - p(x + 1, y)) / mat.get_diag(ind);
-
-}
-#endif //USE_CUDA
-
-void Calc_divergence(const Array2D<double> &u, const Array2D<double> &v, Array2D<double> &res, int N)
-{
-	assert(u.onGPU() == v.onGPU());
-	assert(u.onGPU() == res.onGPU());
-#if USE_CUDA
-	if (res.onGPU())
-	{
-		GPU_calc_divergence <<< gridSize2D(res.w, res.h), blockSize2D >>> (u, v, res, N);
-	}
-	else
-#endif
-	{
-		for (int x = 1; x <= N; x++) for (int y = 1; y <= N; y++) {
-			res(x, y) = -0.5f*(u(x, y - 1) - u(x - 1, y - 1) + v(x - 1, y) - v(x - 1, y - 1)) / N; // -(u_x + v_y)
-		}
-	}
-}
-
-template <class MatrixType>
-static void pressureCorrectionWithA(const Array2D<double> & u, const Array2D<double> & v, const Array2D<double> & p, Array<double> & arr, int sign,
-	const MatrixType & mat)
-{
-	assert(arr.onGPU() == p.onGPU());
-	int N = u.height(), vOff = u.size();
-#if USE_CUDA
-	if (arr.onGPU())
-	{
-		GPU_pressure_correction_u_part <<< gridSize2D(u.w, u.h), blockSize2D >>> (mat.toKernel(), u, p, arr, N, sign);
-		GPU_pressure_correction_v_part <<< gridSize2D(v.w, v.h), blockSize2D >>> (mat.toKernel(), v, p, arr, N, sign, vOff);
-		return;
-	}
-#endif
-	
-	for (int x = 1; x< u.width() - 1; x++) for (int y = 1; y< u.height() - 1; y++)
-	{
-		int ind = u.index(x, y);
-		arr[ind] += sign*0.5f*N*(p(x,y)-p(x-1,y))/mat.get_diag(ind);
-	}
-	for (int x = 1; x< v.width() - 1; x++) for (int y = 1; y< v.height() - 1; y++)
-	{
-		int ind = v.index(x, y) + vOff;
-		arr[ind] += sign*0.5f*N*(p(x,y)-p(x,y-1))/mat.get_diag(ind);
-	}
-}
-
-class NSSolver
-{
-public:
-	enum vars {var_u, var_v, var_p, var_d};
-	int N;
-	//double diff, visc;
-
-	Array<double> vels, vels0, aux, b;
-	Array2D<double> u, u0, v, v0, p, p0, pd, pd0;
-	AdvectDiffusionMatrix2D advectNoMat;
-	SimpleMatrix2D poissNoMat;
-	MatrixCSR advectMat, poissMat;
-
-	NSSolver()
-	{
-		N=0;
-	}
-
-	void init(int size)
-	{
-		N = size;
-		vels.resize(2*N*(N+1)); //for u,v
-		vels0.clone(vels);
-		aux.clone(vels);
-		b.clone(vels); b.fill(0);
-
-		p.resize2d(N,N); p.fill(0); p0.clone(p);
-		pd.clone(p); pd0.clone(pd);
-
-		rebind();
-		reset();
-		advectMat.clone(advectNoMat);
-		poissMat.clone(poissNoMat);
-	}
-
-	void rebind()
-	{
-		u.bind2d(vels, 0, N + 1, N);  v.bind2d(vels, N*(N + 1), N, N + 1);
-		u0.bind2d(vels0, 0, N + 1, N); v0.bind2d(vels0, N*(N + 1), N, N + 1);
-		advectNoMat.set(u, v, 0, 0);
-		poissNoMat.set(p, 4, -1);
-	}
-
-	void reset()
-	{
-		u.fill(0); u0.fill(0); v.fill(0); v0.fill(0); p.fill(0); p0.fill(0);
-		set_bnd(var_u, u); set_bnd(var_u, u0);
-		set_bnd(var_v, v); set_bnd(var_v, v0);
-		set_bnd(var_p, p); set_bnd(var_p, p0);
-	}
-
-	static void set_zero_neumann(arr2D & a)
-	{
-		int ex = a.width()-1, ey=a.height()-1;
-		for (int x=1; x < ex; x++) {a(x,0) = a(x,1); a(x,ey) = a(x,ey-1);}
-		for (int y=1; y < ey; y++) {a(0,y) = a(1,y); a(ex,y) = a(ex-1,y);}
-		a(0,0)=a(1,1);
-		a(ex,0)=a(ex-1,1);
-		a(0,ey)=a(1,ey-1);
-		a(ex,ey)=a(ex-1,ey-1);
-	}
-
-	static void set_bnd ( vars b, arr2D & x )
-	{
-#if USE_CUDA
-		if (x.onGPU())
-		{
-			int gs = gridSize(max(x.width(), x.height()));
-			if (b==var_p)
-				GPU_set_zero_neumann <<< gs, blockSize >>> (x);
-			else
-				GPU_set_bnd <<< gs, blockSize >>> (x, (int) b);
-		}
-		else
-#endif
-		{
-			if (b==var_p) {set_zero_neumann(x); return;}
-			x.fillBorders(0);
-			if (b==var_u) for (int i=1 ; i<x.width()-1 ; i++ ) {x(i,x.height()-1) = 0.05;}
-		}
-	}
-
-	template <class MatrixType>
-	void calcPressure(Array2D<double> & u, Array2D<double> & v, Array2D<double> & p, Array2D<double> & p0, const MatrixType &adMat)
-	{
-		p0.fill(0); p.fill(0);
-		int N = u.height();
-		Calc_divergence(u, v, p0, N);
-		set_bnd ( var_p, p0 );  //zero neumann bnd. cond.
-
-		for (int i = 0; i < 20; i++)
-		{
-			JacobiIter(poissNoMat,p0,p,aux); JacobiIter(poissNoMat,p0,aux,p);
-			set_bnd ( var_p, p );
-		}
-		CudaCheckError();
-
-		/*for (int i = 0; i < p.size(); i++)
-		{
-			double val = p[i];
-			if (val != p[i])
-				throw "Error: There is a NaN in pressure.";
-		}*/
-	}
-	/*static void pressureCorrection(const Array2D<double> & u, const Array2D<double> & v, const Array2D<double> & p, Array<double> & arr)
-	{
-		int N = u.height(), vOff = u.size();
-		for ( int x=1 ; x< u.width()-1 ; x++ ) for (int y=1 ; y< u.height()-1 ; y++ )
-			arr[u.index(x,y)] -= 0.5f*N*(p(x+1,y+1)-p(x,y+1));
-		for ( int x=1 ; x< v.width()-1 ; x++ ) for (int y=1 ; y< v.height()-1 ; y++ )
-			arr[v.index(x,y)+vOff] -= 0.5f*N*(p(x+1,y+1)-p(x+1,y));
-	}*/
-	
-	/*static void pressureCorrectionWithA2(const Array2D<double> & u, const Array2D<double> & v, const Array2D<double> & p, Array<double> & arr, double sign,
-								   const AdvectDiffusionMatrix2D mat)
-	{
-		int N = u.height(), vOff = u.size();
-		for ( int x=1 ; x< u.width()-1 ; x++ ) for (int y=1 ; y< u.height()-1 ; y++ )
-		{
-			int ind = u.index(x,y);
-			arr[ind] += sign*0.5f*N*(p(x+1,y+1)-p(x,y+1))/mat.get_val_in_row(ind, 10);
-		}
-		for ( int x=1 ; x< v.width()-1 ; x++ ) for (int y=1 ; y< v.height()-1 ; y++ )
-		{
-			int ind = v.index(x,y)+vOff;
-			arr[ind] += sign*0.5f*N*(p(x+1,y+1)-p(x+1,y))/mat.get_val_in_row(ind, 10);
-		}
-	}*/
-
-	static void createRHS(const Array<double> & vels0, const Array2D<double> & u, const Array2D<double> & v, const Array2D<double> & p, Array<double> & b)
-	{
-		b.copy(vels0);
-		pressureCorrectionWithA(u, v, p, b, -1, IdentityMatrix(b.size()));
-		//pressureCorrection(u,v,p,b);
-	}
-
-	void prepareAdvectMat(double visc, double dt)
-	{
-		advectNoMat.set(u, v, visc, dt);
-		advectMat.copyVals(advectNoMat);
-	}
-
-	void solveAdvectMat(int iter, double damping)
-	{
-		for (int i = 0; i < iter; i++)
-		{
-			JacobiIter(advectMat, b, vels, aux, damping);
-			JacobiIter(advectMat, b, aux, vels, damping);
-
-		}
-	}
-
-	void simulate_velocity(double visc, double dt)
-	{
-		vels0.copy(vels); p0.copy(p);
-		createRHS(vels0, u, v, p, b);
-		double residuum = 1e10;
-		int count=0;
-		prepareAdvectMat(visc,dt);
-		auto & matToUse = advectMat;
-
-		for (int i = 0; i < 10; i++)
-		{
-
-			solveAdvectMat(5, 0.7);
-			set_bnd ( var_u, u ); set_bnd ( var_v, v );
-			pressureCorrectionWithA(u, v, p, vels, 1, matToUse);
-			calcPressure(u, v, p, p0, matToUse);
-			pressureCorrectionWithA(u, v, p, vels, -1, matToUse);
-
-			createRHS(vels0, u, v, p, b);
-			residuum = Residuum(matToUse, b, vels);
-			CudaCheckError();
-			count++;
-		}
-	}
-};
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cpp b/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cpp
deleted file mode 100644
index ce15dfa47e129b953d2f9466e416a83047ce148b..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/***************************************************************************
-                          tnl-incompressible-navier-stokes.cpp  -  description
-                             -------------------
-    begin                : Jan 28, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "tnl-incompressible-navier-stokes.h"
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cu b/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cu
deleted file mode 100644
index b5cf257ca907bb0e375c8d78da55712514dceb15..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-/***************************************************************************
-                          tnl-incompressible-navier-stokes.cu  -  description
-                             -------------------
-    begin                : Jan 28, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "tnl-incompressible-navier-stokes.h"
\ No newline at end of file
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.h b/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.h
deleted file mode 100644
index 4e7edefc3aa62cf45defd35a08ec3503371445d1..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnl-incompressible-navier-stokes.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/***************************************************************************
-                          tnl-incompressible-navier-stokes.h  -  description
-                             -------------------
-    begin                : Jan 28, 2015
-    copyright            : (C) 2015 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNL_INCOMPRESSIBLE_NAVIER_STOKES_H_
-#define TNL_INCOMPRESSIBLE_NAVIER_STOKES_H_
-
-#include <solvers/tnlSolver.h>
-#include <operators/diffusion/tnlLinearDiffusion.h>
-#include "tnlIncompressibleNavierStokesProblem.h"
-#include "tnlNSFastBuildConfig.h"
-
-//typedef tnlDefaultConfigTag BuildConfig;
-typedef tnlNSFastBuildConfig BuildConfig;
-
-template< typename ConfigTag >
-class tnlIncompressibleNavierStokesConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Incompressible Navier-Stokes solver settings:" );
-		 config.addEntry< double >( "viscosity", "Viscosity of the diffusion." );
-		 config.addEntry< double >( "inletVelocity", "Maximal X velocity on the inlet." );
-
-         /*config.addEntry< tnlString >( "boundary-conditions-type", "Choose the boundary conditions type.", "dirichlet");
-            config.addEntryEnum< tnlString >( "dirichlet" );
-            config.addEntryEnum< tnlString >( "neumann" );
-
-         config.addEntry< tnlString >( "boundary-conditions-file", "File with the values of the boundary conditions.", "boundary.tnl" );
-         config.addEntry< double >( "boundary-conditions-constant", "This sets a value in case of the constant boundary conditions." );
-         config.addEntry< double >( "right-hand-side-constant", "This sets a constant value for the right-hand side.", 0.0 );
-         config.addEntry< tnlString >( "initial-condition", "File with the initial condition.", "initial.tnl");*/
-	  }
-};
-
-template< typename Mesh, typename Real = typename Mesh::RealType, typename Index = typename Mesh::IndexType >
-class tnlINSBoundaryConditions{};
-
-template< typename Mesh, typename Real = typename Mesh::RealType, typename Index = typename Mesh::IndexType >
-class tnlINSRightHandSide{};
-
-template< typename Mesh, typename Real = typename Mesh::RealType, typename Index = typename Mesh::IndexType >
-class tnlIncompressibleNavierStokes
-{
-   public:
-	  typedef Real RealType;
-	  typedef typename Mesh::DeviceType DeviceType;
-	  typedef Index IndexType;
-};
-
-
-template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter >
-class tnlIncompressibleNavierStokesSetter
-{
-public:
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   typedef tnlStaticVector< MeshType::Dimensions, Real > Vertex;
-
-   static bool run( const tnlParameterContainer& parameters )
-   {
-      enum { Dimensions = MeshType::Dimensions };
-      typedef tnlStaticVector < MeshType::Dimensions, Real > Vertex;
-
-	  typedef tnlINSBoundaryConditions< MeshType > BoundaryConditions;
-	  typedef tnlIncompressibleNavierStokes< MeshType > ApproximateOperator;
-	  typedef tnlINSRightHandSide< MeshType > RightHandSide;
-      typedef tnlIncompressibleNavierStokesProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
-      SolverStarter solverStarter;
-      return solverStarter.template run< Solver >( parameters );
-   }
-};
-
-int main( int argc, char* argv[] )
-{
-   tnlSolver< tnlIncompressibleNavierStokesSetter, tnlIncompressibleNavierStokesConfig, BuildConfig > solver;
-   if( ! solver. run( argc, argv ) )
-      return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-#endif /* TNL_INCOMPRESSIBLE_NAVIER_STOKES_H_ */
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper.h b/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper.h
deleted file mode 100644
index 3bdf51298738a2bfcc9d272547ce91cc779e7489..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/***************************************************************************
-                          tnlExplicitINSTimeStepper.h  -  description
-                             -------------------
-    begin                : Feb 17, 2015
-    copyright            : (C) 2015 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_H_
-#define EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_H_
-
-template< typename Problem,
-          typename LinearSolver >
-class tnlExplicitINSTimeStepper
-{
-   public:
-
-   typedef Problem ProblemType;
-   typedef typename Problem::RealType RealType;
-   typedef typename Problem::DeviceType DeviceType;
-   typedef typename Problem::IndexType IndexType;
-   typedef typename Problem::MeshType MeshType;
-   typedef typename ProblemType::DofVectorType DofVectorType;
-
-   tnlExplicitINSTimeStepper(): problem(0), timeStep(0) {}
-
-   static void configSetup( tnlConfigDescription& config, const tnlString& prefix = "" )
-   {
-	   config.addEntry< bool >( "verbose", "Verbose mode.", true );
-   }
-
-   bool setup( const tnlParameterContainer& parameters,
-			  const tnlString& prefix = "" )
-   {
-	   //this->verbose = parameters.getParameter< bool >( "verbose" );
-	   return true;
-   }
-
-   bool init( const MeshType& mesh )
-   {
-	   /*cout << "Setting up the linear system...";
-	   if( ! this->problem->setupLinearSystem( mesh, this->matrix ) )
-		  return false;
-	   cout << " [ OK ]" << endl;
-	   if( this->matrix.getRows() == 0 || this->matrix.getColumns() == 0 )
-	   {
-		  cerr << "The matrix for the semi-implicit time stepping was not set correctly." << endl;
-		  if( ! this->matrix.getRows() )
-			 cerr << "The matrix dimensions are set to 0 rows." << endl;
-		  if( ! this->matrix.getColumns() )
-			 cerr << "The matrix dimensions are set to 0 columns." << endl;
-		  cerr << "Please check the method 'setupLinearSystem' in your solver." << endl;
-		  return false;
-	   }
-	   if( ! this->rightHandSide.setSize( this->matrix.getRows() ) )
-		  return false;*/
-	   return true;
-   }
-
-   void setProblem( ProblemType& problem ) {this -> problem = &problem;}
-   ProblemType* getProblem() const {return this -> problem;}
-
-   bool setTimeStep( const RealType& timeStep )
-   {
-	   if( timeStep <= 0.0 )
-	   {
-		  cerr << "Time step for tnlExplicitINSTimeStepper must be positive. " << endl;
-		  return false;
-	   }
-	   this->timeStep = timeStep;
-	   return true;
-   }
-
-   const RealType& getTimeStep() const;
-
-   bool solve( const RealType& time,
-               const RealType& stopTime,
-               const MeshType& mesh,
-               DofVectorType& dofVector,
-			   DofVectorType& auxiliaryDofVector )
-   {
-	   tnlAssert( this->problem != 0, );
-	   RealType t = time;
-	   while( t < stopTime )
-	   {
-		  RealType currentTau = Min( this->timeStep, stopTime - t );
-		  currentTau = 0.005;
-
-		  this->problem->doStep(currentTau,mesh);
-
-		  t += currentTau;
-	   }
-	   return true;
-   }
-
-   bool writeEpilog( tnlLogger& logger ) const { return true; }
-
-   protected:
-
-   Problem* problem;
-   //LinearSolver _matSolver;
-   RealType timeStep;
-
-};
-
-#include "tnlExplicitINSTimeStepper_impl.h"
-
-#endif /* EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_H_ */
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper_impl.h b/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper_impl.h
deleted file mode 100644
index 5cde25f226334bd6f8427aaee025c2b672a89780..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnlExplicitINSTimeStepper_impl.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/***************************************************************************
-                          tnlExplicitINSTimeStepper_impl.h  -  description
-                             -------------------
-    begin                : Feb 17, 2015
-    copyright            : (C) 2015 by oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_IMPL_H_
-#define EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_IMPL_H_
-
-
-
-#endif /* EXAMPLES_INCOMPRESSIBLE_NAVIER_STOKES_TNLEXPLICITINSTIMESTEPPER_IMPL_H_ */
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem.h b/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem.h
deleted file mode 100644
index cb4495c2898ff346e713d1563c4fc0be1a439199..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem.h
+++ /dev/null
@@ -1,489 +0,0 @@
-/***************************************************************************
-                          tnlIncompressibleNavierStokesProblem.h  -  description
-                             -------------------
-    begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_H_
-#define TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_H_
-
-#include <mesh/tnlGrid2D.h>
-#include <problems/tnlPDEProblem.h>
-#include <operators/diffusion/tnlLinearDiffusion.h>
-#include <core/arrays/tnlStaticArray.h>
-#include <solvers/pde/tnlLinearSystemAssembler.h>
-#include <solvers/linear/stationary/tnlJacobiSolver.h>
-#include <operators/tnlAnalyticNeumannBoundaryConditions.h>
-#include <functors/tnlConstantFunction.h>
-#include <solvers/pde/tnlNoTimeDiscretisation.h>
-#include <matrices/tnlEllpackMatrix.h>
-#include "tnlExplicitINSTimeStepper.h"
-#include "solver.h"
-
-template<class T> T square(const T & val){return val*val;}
-
-template< typename Mesh,
-          typename BoundaryCondition,
-          typename RightHandSide,
-          typename DifferentialOperator >
-class tnlIncompressibleNavierStokesProblem : public tnlPDEProblem< Mesh,
-                                                                   typename DifferentialOperator::RealType,
-                                                                   typename Mesh::DeviceType,
-                                                                   typename DifferentialOperator::IndexType  >
-{
-
-public:
-	typedef typename DifferentialOperator::RealType RealType;
-	typedef typename Mesh::DeviceType DeviceType;
-	typedef typename DifferentialOperator::IndexType IndexType;
-	typedef tnlIncompressibleNavierStokesProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator > ThisType;
-	typedef tnlPDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;
-	using typename BaseType::MeshType;
-	//typedef tnlGrid<2, RealType, tnlHostDevice, IndexType> MeshType;
-	using typename BaseType::DofVectorType;
-
-	typedef tnlEllpackMatrix< RealType, tnlHost, IndexType > MatrixType;
-	typedef tnlJacobiSolver<MatrixType> LinearSolver;
-	typedef tnlExplicitINSTimeStepper< ThisType, LinearSolver > TimeStepper;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	 enum { Dimensions = Mesh::Dimensions };
-
-protected:
-	NSSolver validator;
-	RealType visc, upVelocity;
-	MatrixType poissonMat, advectDiffuseMat;
-
-	DofVectorType vel, vel0, vel_aux, vel_rhs, p, p_rhs;
-
-
-public:
-
-      static tnlString getTypeStatic() {return tnlString( "tnlNSProblem< " ) + Mesh :: getTypeStatic() + " >";}
-
-      tnlString getPrologHeader() const{return tnlString( "NS equation" );}
-
-      void writeProlog( tnlLogger& logger,
-                        const tnlParameterContainer& parameters ) const {}
-
-	  bool setup( const tnlParameterContainer& parameters ){
-		  visc = parameters.getParameter< RealType >( "viscosity" );
-		 /*if( ! this->boundaryCondition.setup( parameters, "boundary-conditions-" ) ||
-			 ! this->rightHandSide.setup( parameters, "right-hand-side-" ) )
-			return false;*/
-		 return true;
-	  }
-
-	  void preparePoisson(const MeshType& mesh, MatrixType& matrix ) const
-	  {
-		  IndexType nx = mesh.getDimensions().x(), ny = mesh.getDimensions().y(), n = nx*ny;
-		  typename MatrixType::CompressedRowsLengthsVector rowLenghts;
-		  rowLenghts.setSize(n);
-		  for (IndexType y = 0; y < ny; y++) for (IndexType x = 0; x < nx; x++)
-			  rowLenghts[mesh.getCellIndex(CoordinatesType(x,y))] = mesh.isBoundaryCell(CoordinatesType(x,y))? 1 : 5;
-		  matrix.setDimensions(n,n);
-		  matrix.setCompressedRowsLengths(rowLenghts);
-		  for (IndexType y = 0; y < ny; y++) for (IndexType x = 0; x < nx; x++)
-		  {
-			  IndexType row = mesh.getCellIndex(CoordinatesType(x,y));
-
-			  if (x==0)				{matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x+1,y)), 1.0); continue;}
-			  else if (y==0)		{matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x,y+1)), 1.0); continue;}
-			  else if (x==nx-1 )	{matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x-1,y)), 1.0); continue;}
-			  else if (y==ny-1)		{matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x,y-1)), 1.0); continue;}
-
-			  matrix.setElement(row, row, 4);
-			  matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x+1,y)), -1);
-			  matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x-1,y)), -1);
-			  matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x,y+1)), -1);
-			  matrix.setElement(row, mesh.getCellIndex(CoordinatesType(x,y-1)), -1);
-		  }
-	  }
-
-      bool setInitialCondition( const tnlParameterContainer& parameters,
-                                const MeshType& mesh,
-                                DofVectorType& dofs,
-								DofVectorType& auxDofs )
-	  {
-		  vel.setSize(mesh.getNumberOfFaces());
-		  vel0.setSize(vel.getSize());
-		  vel_aux.setSize(vel.getSize());
-		  vel_rhs.setSize(vel.getSize());
-		  p.setSize(mesh.getNumberOfCells());
-		  p_rhs.setSize(mesh.getNumberOfCells());
-		  validator.init(sqrt(mesh.getNumberOfCells()));
-
-		  vel.setValue(0); vel0.setValue(0);
-		  p.setValue(0); p_rhs.setValue(0);
-
-		  upVelocity = parameters.getParameter< RealType >( "inletVelocity" );
-		  upVelocity = 1;
-
-		  //Prepare diffusion matrix pattern
-		  typename MatrixType::CompressedRowsLengthsVector rowLenghts;
-		  rowLenghts.setSize(mesh.getNumberOfFaces());
-		  for (int i = 0; i < rowLenghts.getSize(); i++)
-			  rowLenghts[i] = num_in_row(mesh, i);
-		  advectDiffuseMat.setDimensions(mesh.getNumberOfFaces(), mesh.getNumberOfFaces());
-		  advectDiffuseMat.setCompressedRowsLengths(rowLenghts);
-
-		  preparePoisson(mesh, poissonMat);
-
-		  SetBnd(mesh);
-		  return true;
-	  }
-
-	  //template< typename MatrixType >
-	  bool setupLinearSystem( const MeshType& mesh, MatrixType& matrix ){/*NO*/}
-
-      bool makeSnapshot( const RealType& time,
-                         const IndexType& step,
-                         const MeshType& mesh,
-                         DofVectorType& dofs,
-						 DofVectorType& auxDofs )
-	  {
-		  cout << endl << "Writing output at time " << time << " step " << step << "." << endl;
-
-		  //this->bindAuxiliaryDofs( mesh, auxiliaryDofs );
-		  //cout << "dofs = " << dofs << endl;
-		  tnlString fileName;
-		  FileNameBaseNumberEnding( "u-", step, 5, ".vtk", fileName );
-		  save("test.txt", mesh);
-		  //if( ! this->solution.save( fileName ) )
-		  //   return false;
-		  return true;
-	  }
-
-	  IndexType getDofs( const MeshType& mesh ) const {return mesh.getNumberOfFaces();}
-	  void bindDofs( const MeshType& mesh, DofVectorType& dofVector ) {}
-
-	  void getExplicitRHS( const RealType& time, const RealType& tau, const MeshType& mesh, DofVectorType& _u, DofVectorType& _fu ) {/*NO*/}
-
-	  //template< typename MatrixType >
-      void assemblyLinearSystem( const RealType& time,
-                                 const RealType& tau,
-                                 const MeshType& mesh,
-                                 DofVectorType& dofs,
-                                 DofVectorType& auxDofs,
-                                 MatrixType& matrix,
-								 DofVectorType& rightHandSide ) {/*NO*/}
-
-	  void set_zero_neumann(tnlSharedVector< RealType, DeviceType, IndexType > & vec)
-	  {
-		  /*int ex = a.width()-1, ey=a.height()-1;
-		  for (int x=1; x < ex; x++) {a(x,0) = a(x,1); a(x,ey) = a(x,ey-1);}
-		  for (int y=1; y < ey; y++) {a(0,y) = a(1,y); a(ex,y) = a(ex-1,y);}
-		  a(0,0)=0.5*(a(0,1)+a(1,0));
-		  a(ex,0)=0.5*(a(ex-1,0)+a(ex,1));
-		  a(0,ey)=0.5*(a(1,ey)+a(0,ey-1));
-		  a(ex,ey)=0.5*(a(ex-1,ey)+a(ex,ey-1));*/
-	  }
-
-	  void SetBnd(const MeshType& mesh)
-	  {
-		  for (int i = 1; i < mesh.getDimensions().x(); i++)
-		  {
-			  IndexType ind = mesh.template getFaceIndex<1,0>(CoordinatesType(i, mesh.getDimensions().y() - 1));
-			  vel0[ind] = vel[ind] = 0.05;
-		  }
-	  }
-
-	  double getCenterU(const MeshType& mesh, IndexType cell) //x,y based on cells
-	  {
-		  return 0.5*(vel0[mesh.template getFaceNextToCell<-1,0>(cell)] + vel0[mesh.template getFaceNextToCell<+1,0>(cell)] );
-	  }
-	  double getCenterV(const MeshType& mesh, IndexType cell) //x,y based on cells
-	  {
-		  return 0.5*(vel0[mesh.template getFaceNextToCell<0,-1>(cell)] + vel0[mesh.template getFaceNextToCell<0,+1>(cell)] );
-	  }
-	  double getCrossU(const MeshType& mesh, int x, int y) //x,y based (n+1)*(n+1)
-	  {
-		  const CoordinatesType cellCoords(x,y);
-		  const CoordinatesType downCoords(x,y-1);
-		  return 0.5*(vel0[mesh.template getFaceNextToCell<-1,0>(mesh.getCellIndex(cellCoords))]
-					 +vel0[mesh.template getFaceNextToCell<-1,0>(mesh.getCellIndex(downCoords))]);
-	  }
-	  double getCrossV(const MeshType& mesh, int x, int y) //x,y based (n+1)*(n+1)
-	  {
-		  const CoordinatesType cellCoords(x,y);
-		  const CoordinatesType leftCoords(x-1,y);
-		  return 0.5*(vel0[mesh.template getFaceNextToCell<0,-1>(mesh.getCellIndex(cellCoords))]
-					 +vel0[mesh.template getFaceNextToCell<0,-1>(mesh.getCellIndex(leftCoords))]);
-	  }
-
-	  RealType HorAvgXFace(const MeshType& mesh,const DofVectorType & val, IndexType x, IndexType y) const
-	  {
-		IndexType i1 = mesh.template getFaceIndex<1,0>(CoordinatesType(x,y)) , i2 = mesh.template getFaceIndex<1,0>(CoordinatesType(x+1,y));
-		return 0.5*(val[i1] + val[i2]);
-	  }
-	  RealType VerAvgXFace(const MeshType& mesh,const DofVectorType & val, IndexType x, IndexType y) const
-	  {
-		IndexType i1 = mesh.template getFaceIndex<1,0>(CoordinatesType(x,y)) , i2 = mesh.template getFaceIndex<1,0>(CoordinatesType(x,y+1));
-		return 0.5*(val[i1] + val[i2]);
-	  }
-	  RealType HorAvgYFace(const MeshType& mesh,const DofVectorType & val, IndexType x, IndexType y) const
-	  {
-		IndexType i1 = mesh.template getFaceIndex<0,1>(CoordinatesType(x,y)) , i2 = mesh.template getFaceIndex<0,1>(CoordinatesType(x+1,y));
-		return 0.5*(val[i1] + val[i2]);
-	  }
-	  RealType VerAvgYFace(const MeshType& mesh,const DofVectorType & val, IndexType x, IndexType y) const
-	  {
-		IndexType i1 = mesh.template getFaceIndex<0,1>(CoordinatesType(x,y)) , i2 = mesh.template getFaceIndex<0,1>(CoordinatesType(x,y+1));
-		return 0.5*(val[i1] + val[i2]);
-	  }
-
-	  int num_in_row(const MeshType& mesh, int row) const {
-		  IndexType fx, fy;
-		  CoordinatesType coord = mesh.getFaceCoordinates(row, fx, fy);
-		  if ((fx && mesh.template isBoundaryFace<1,0>(coord)) || (fy && mesh.template isBoundaryFace<0,1>(coord)))
-			  return 1;
-		  return 5;
-	  }
-	  void get_el_in_row(const MeshType& mesh, const DofVectorType & uv, IndexType row, IndexType ind_in_row, RealType dt, RealType & val, IndexType &col) const
-	  {
-			IndexType fx, fy;
-			CoordinatesType coord = mesh.getFaceCoordinates(row, fx, fy);
-			int x = coord.x(), y = coord.y();
-			if ((fx && mesh.template isBoundaryFace<1,0>(coord)) || (fy && mesh.template isBoundaryFace<0,1>(coord)))
-				{col = row; val = 1; return;}
-
-			IndexType nx = mesh.getDimensions().x(), ny = mesh.getDimensions().y();
-			const RealType dx = 1.0/nx, dy=1.0/ny, vix = dt*visc/(dx*dx), viy=dt*visc/(dy*dy);
-			RealType cxm=0,cym=0,cxp=0,cyp=0;
-			if (fx)
-			{
-			  cxm = -0.25*HorAvgXFace(mesh, uv, x-1, y)/dx; cxp = 0.25*HorAvgXFace(mesh, uv, x, y)/dx;
-			  cym = -0.25*HorAvgYFace(mesh, uv, x-1, y)/dy; cyp = 0.25*HorAvgYFace(mesh, uv, x-1, y+1)/dy;
-			}
-			else
-			{
-			  cxm = -0.25*VerAvgXFace(mesh, uv, x, y-1)/dx; cxp = 0.25*VerAvgXFace(mesh, uv, x+1, y-1)/dx;
-			  cym = -0.25*VerAvgYFace(mesh, uv, x, y-1)/dy; cyp = 0.25*VerAvgYFace(mesh, uv, x, y)/dy;
-			}
-
-			CoordinatesType colCoord;
-			switch(ind_in_row)
-			{
-			case 0: val = 1+dt*(cxm+cxp+cym+cyp)+2*vix+2*viy; colCoord = coord; break;
-			case 1: val = dt*cxm-vix; colCoord = CoordinatesType(x-1,y); break;
-			case 2: val = dt*cxp-vix; colCoord = CoordinatesType(x+1,y); break;
-			case 3: val = dt*cym-viy; colCoord = CoordinatesType(x,y-1); break;
-			case 4: val = dt*cyp-viy; colCoord = CoordinatesType(x,y+1); break;
-			case 10: val = 1+2*dt*(cxm+cxp+cym+cyp); colCoord = coord; break; //special number for sum of whole row
-			}
-			if (fx) col = mesh.template getFaceIndex<1,0>(colCoord);
-			else	col = mesh.template getFaceIndex<0,1>(colCoord);
-	  }
-
-	  void pressureCorrectionWithA(const MeshType& mesh, DofVectorType& x, RealType sign, MatrixType* mat)
-	  {
-		  IndexType fx,fy;
-		  IndexType nx = mesh.template getNumberOfFaces< 1,0 >(), ny = mesh.template getNumberOfFaces< 0,1 >();
-		  RealType invDx = mesh.getDimensions().x(), invDy = mesh.getDimensions().y();
-		  for (int i = 0; i < nx; i++)
-		  {
-			  if (mesh.template isBoundaryFace<1,0>(mesh.getFaceCoordinates(i, fx, fy))) continue;
-			  RealType add = sign*0.5*invDx*(p[mesh.template getCellNextToFace<1,0>(i)] - p[mesh.template getCellNextToFace<-1,0>(i)]);
-			  if (mat != NULL) add /= mat->getElement(i,i);
-			  x[i] += add;
-		  }
-		  for (int i = nx; i < nx+ny; i++)
-		  {
-			  if (mesh.template isBoundaryFace<0,1>(mesh.getFaceCoordinates(i, fx, fy))) continue;
-			  RealType add = sign*0.5*invDy*(p[mesh.template getCellNextToFace<0,1>(i)] - p[mesh.template getCellNextToFace<0,-1>(i)]);
-			  if (mat != NULL) add /= mat->getElement(i,i);
-			  x[i] += add;
-		  }
-	  }
-
-	  void createRHS(const MeshType& mesh, DofVectorType& b, RealType sign)
-	  {
-		  b = vel0;
-		  pressureCorrectionWithA(mesh, b, sign, NULL);
-	  }
-
-	  static bool checkMatrices(const MatrixType& tnlMat, const MatrixCSR& myMat)
-	  {
-		  if (tnlMat.getRows() != myMat.num_rows()) throw "Different number of rows";
-		  if (tnlMat.getColumns() != myMat.num_cols()) throw "Different number of cols";
-		  for (int r = 0; r < tnlMat.getRows(); r++)
-		  {
-			  //const typename MatrixType::MatrixRow & row = tnlMat.getRow(r);
-			  //if (row.length != myMat.num_in_row(r))
-				//  throw "Different number of cells in row";
-
-			  for (int i = 0; i < myMat.num_in_row(r); i++)
-			  {
-				  int col = myMat.get_col_index(r,i), col2 = col;
-				  double val = tnlMat.getElement(r, col), val2 = myMat.get_val_in_row(r, i);
-				  if (col!=col2)
-					  throw "Column indeces are different";
-				  if (!Equal(val,val2))
-					  throw "Values are different";
-			  }
-		  }
-	  }
-	  static bool checkVectors(const DofVectorType& tnlVec, const ArrayD& myVec)
-	  {
-		  if (tnlVec.getSize() != myVec.size()) throw "Different vector size";
-		  for (int i = 0; i < myVec.size(); i++)
-		  {
-			  double a = tnlVec[i], b = myVec[i];
-			  if (!Equal(a,b))
-				  throw "Different";
-		  }
-		  return true;
-	  }
-
-	  static void JacobiIter(const MatrixType& matrix, const DofVectorType& b, DofVectorType& x, DofVectorType & aux, RealType omega)
-	  {
-		  IndexType size = matrix.getRows();
-		  for( IndexType row = 0; row < size; row ++ )
-			 matrix.performJacobiIteration( b, row, x, aux, omega );
-		  for( IndexType row = 0; row < size; row ++ )
-			 matrix.performJacobiIteration( b, row, aux, x, omega );
-	  }
-
-	  void solveAdvectMat(int iter, double omega)
-	  {
-		  for (int i = 0; i < iter; i++)
-			  JacobiIter(advectDiffuseMat, vel_rhs, vel, vel_aux, omega);
-	  }
-
-	  void prepareAdvectDiffMat(const MeshType& mesh, RealType dt)
-	  {
-		  validator.prepareAdvectMat(visc, dt);
-		  checkVectors(vel, validator.vels);
-
-		  RealType val;
-		  IndexType col;
-		  for (int row = 0; row < advectDiffuseMat.getRows(); row++)
-			  for (int i = 0; i < num_in_row(mesh, row); i++)
-			  {
-				  get_el_in_row(mesh, vel, row, i, dt, val, col);
-				  advectDiffuseMat.setElement(row, col, val);
-			  }
-		  createRHS(mesh, vel_rhs, -1);
-		  validator.createRHS(validator.vels0,validator.u, validator.v, validator.p,validator.b);
-		  vel_aux = vel;
-		  validator.aux.copy(validator.vels);
-
-		  checkMatrices(advectDiffuseMat, validator.advectMat);
-		  checkMatrices(poissonMat, validator.poissMat);
-		  checkVectors(vel_rhs, validator.b);
-		  checkVectors(vel_aux, validator.aux);
-
-		  int iter = 1; double omega = 0.7;
-		  validator.solveAdvectMat(iter, omega);
-		  solveAdvectMat(iter, omega);
-
-		  checkVectors(vel, validator.vels);
-		  iter++;
-	  }
-
-	  void doStep(RealType dt, const MeshType& mesh)
-	  {
-		prepareAdvectDiffMat(mesh, dt);
-	  }
-
-	  void computeVelocityDivergence(IndexType cell, const tnlVector<RealType, DeviceType, IndexType> & v, const MeshType& mesh, tnlVector<RealType, DeviceType, IndexType> & rhs)
-	  {
-		  double diffU = v[mesh.template getFaceNextToCell<1,0>(cell)] - v[mesh.template getFaceNextToCell<-1,0>(cell)];
-		  double diffV = v[mesh.template getFaceNextToCell<0,1>(cell)] - v[mesh.template getFaceNextToCell<0,-1>(cell)];
-		  rhs[cell] = -0.5f*(diffU/mesh.getDimensions().x() + diffV/mesh.getDimensions().y()); // -(u_x + v_y)
-	  }
-	  void updateVelocityByPressureCorrection(IndexType cell, const tnlVector<RealType, DeviceType, IndexType> & v, const MeshType& mesh, tnlVector<RealType, DeviceType, IndexType> & p)
-	  {
-		  RealType pVal = p[cell];
-		  double nx =mesh.getDimensions().x(), ny=mesh.getDimensions().y();
-		  vel[mesh.template getFaceNextToCell<-1,0>(cell)] -= 0.5*nx*pVal;
-		  vel[mesh.template getFaceNextToCell<+1,0>(cell)] += 0.5*nx*pVal;
-		  vel[mesh.template getFaceNextToCell<0,-1>(cell)] -= 0.5*ny*pVal;
-		  vel[mesh.template getFaceNextToCell<0,+1>(cell)] += 0.5*ny*pVal;
-	  }
-
-	  void project(const MeshType& mesh)
-	  {
-		  typedef tnlConstantFunction< Dimensions, RealType > ConstantFunction;
-		  typedef tnlLinearDiffusion< MeshType, RealType, IndexType> LinDiffOper;
-		  typedef tnlAnalyticNeumannBoundaryConditions< MeshType, ConstantFunction, RealType, IndexType > BoundaryConditions;
-
-		   tnlLinearSystemAssembler< MeshType,
-									tnlVector<RealType, DeviceType, IndexType>,
-									LinDiffOper,
-									BoundaryConditions,
-									ConstantFunction,
-									tnlNoTimeDiscretisation,
-									MatrixType > systemAssembler;
-		  LinDiffOper linDiffOper;
-		  BoundaryConditions boundaryConditions;
-		  ConstantFunction zeroFunc;
-
-		  systemAssembler.template assembly< Mesh::Dimensions >( (RealType)0,
-																 (RealType)0,
-																 mesh,
-																 linDiffOper,
-																 boundaryConditions,
-																 zeroFunc, //rhs func
-																 p,
-																 poissonMat,
-																 p_rhs );
-
-		  //_matSolver.setMatrix(poissonMat);
-		  //_matSolver.solve(p_rhs,p);
-		  int nx = mesh.getDimensions().x(), ny=mesh.getDimensions().y();
-		  for ( int i=0 ; i< nx; i++ ) for (int j=0 ; j< ny; j++ )
-			  computeVelocityDivergence(mesh.getCellIndex(CoordinatesType( i, j )), vel, mesh, p_rhs);
-
-		  for (int x=1; x< nx-1; x++) for(int y=1; y < ny-1; y++)
-			 updateVelocityByPressureCorrection(mesh.getCellIndex(CoordinatesType(x,y)),vel, mesh, p);
-	  }
-
-	  void save(const char * filename, const MeshType& mesh)
-	  {
-			//FILE * pFile = fopen (filename, "w");
-			//fprintf(pFile, "#X	Y	u	v\n");
-			int nx = mesh.getDimensions().x(), ny=mesh.getDimensions().y(), n=nx*ny;
-			int dims[] = {nx,ny,1};
-			double *vars = new double[n*3];
-			double *vvars[] = {vars};
-
-			int varDim[] = {3};
-			int centering[] = {0};
-			const char * names[] = {"Rychlost"};
-
-			for (IndexType j=0 ; j< ny ; j++ ) for ( IndexType i=0 ; i< nx ; i++ )
-			{
-				IndexType cell = mesh.getCellIndex(typename MeshType::CoordinatesType(i,j));
-				int ii = 3*(j*nx+i);
-				vars[ii+0] = getCenterU(mesh, cell);
-				vars[ii+1] = getCenterV(mesh, cell);
-				vars[ii+2] = 0;
-				//fprintf(pFile, "%lg	%lg	%lg	%lg\n", (RealType)i, (RealType)j, getCenterU(mesh, cell), getCenterV(mesh, cell));
-			}
-			//fclose (pFile);
-			void write_regular_mesh(const char *filename, int useBinary, int *dims,
-									int nvars, int *vardim, int *centering,
-									const char * const *varnames, double **vars);
-			write_regular_mesh(filename, 0, dims, 1, varDim, centering, names, vvars );
-			delete[] vars;
-	  }
-};
-
-#include "tnlIncompressibleNavierStokesProblem_impl.h"
-
-#endif /* TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_H_ */
-
-
-//Refaktor, do objektu, setup na parametry, laplace podle tnlLinearDiffusion
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem_impl.h b/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem_impl.h
deleted file mode 100644
index 995c404b67e7db8c7095bf3db3b7e4ffe2ed57b1..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnlIncompressibleNavierStokesProblem_impl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/***************************************************************************
-                          tnlIncompressibleNavierStokesProblem_impl.h  -  description
-                             -------------------
-    begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_IMPL_H_
-#define TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_IMPL_H_
-
-#include <core/mfilename.h>
-#include <matrices/tnlMatrixSetter.h>
-#include <matrices/tnlMultidiagonalMatrixSetter.h>
-#include <core/tnlLogger.h>
-#include <solvers/pde/tnlExplicitUpdater.h>
-#include <solvers/pde/tnlLinearSystemAssembler.h>
-#include <solvers/pde/tnlBackwardTimeDiscretisation.h>
-
-
-
-
-
-
-
-
-
-
-
-
-#endif /* TNLINCOMPRESSIBLENAVIERSTOKESPROBLEM_IMPL_H_ */
diff --git a/src/TNL/legacy/incompressible-navier-stokes/tnlNSFastBuildConfig.h b/src/TNL/legacy/incompressible-navier-stokes/tnlNSFastBuildConfig.h
deleted file mode 100644
index d683048bfd40f6522eb26b6fa3ab84218249ff61..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/tnlNSFastBuildConfig.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************
-                          tnlNSFastBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLNSFASTBUILDCONFIG_H_
-#define TNLNSFASTBUILDCONFIG_H_
-
-class tnlNSFastBuildConfig
-{
-   public:
-
-      static void print() { cerr << "tnlNSFastBuildConfig" << endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< tnlNSFastBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< tnlNSFastBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< tnlNSFastBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< tnlNSFastBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * 1, 2, and 3 dimensions are enabled by default
- */
-template<> struct tnlConfigTagDimensions< tnlNSFastBuildConfig, 1 >{ enum { enabled = false }; };
-template<> struct tnlConfigTagDimensions< tnlNSFastBuildConfig, 2 >{ enum { enabled = true }; };
-template<> struct tnlConfigTagDimensions< tnlNSFastBuildConfig, 3 >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< tnlNSFastBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< tnlNSFastBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< tnlNSFastBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< tnlNSFastBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< tnlNSFastBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< tnlNSFastBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< tnlNSFastBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< tnlNSFastBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< tnlNSFastBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* TNLNSFASTBUILDCONFIG_H_ */
diff --git a/src/TNL/legacy/incompressible-navier-stokes/visit_writer.cpp b/src/TNL/legacy/incompressible-navier-stokes/visit_writer.cpp
deleted file mode 100644
index c27478eacd5849a2cd2c8425007c5c27cff3977a..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/visit_writer.cpp
+++ /dev/null
@@ -1,1066 +0,0 @@
-/* ************************************************************************* //
-//                             visit_writer.c                                //
-// ************************************************************************* */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "visit_writer.h" //mozna uvozovky
-
-
-/*
- * Globals.
- */
-
-static FILE *fp = NULL;
-static int useBinary = 0;
-static int numInColumn = 0;
-
-
-/* ****************************************************************************
- *  Function: end_line
- *
- *  Purpose:
- *      If floats or ints have been written using the write_float or write_int
- *      functions, this will issue a newline (if necessary) so that a new
- *      heading can be placed.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void end_line(void)
-{
-    if (!useBinary)
-    {
-        char str2[8] = "\n";
-        fprintf(fp, str2);
-        numInColumn = 0;
-    }
-}
-
-
-/* ****************************************************************************
- *  Function: open_file
- *
- *  Purpose:
- *      Opens a file for writing and assigns the handle to the global variable
- *      "fp".
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void open_file(const char *filename)
-{
-    char full_filename[1024];
-    if (strstr(filename, ".vtk") != NULL)
-    {
-        strcpy(full_filename, filename);
-    }
-    else
-    {
-        sprintf(full_filename, "%s.vtk", filename);
-    }
-
-    fp = fopen(full_filename, "w+");
-}
-
-
-/* ****************************************************************************
- *  Function: close_file
- *
- *  Purpose:
- *      Closes the file with handle "fp" (a global variable).
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void close_file(void)
-{
-    end_line();
-    fclose(fp);
-    fp = NULL;
-}
-
-
-/* ****************************************************************************
- *  Function: force_big_endian
- *
- *  Purpose:
- *      Determines if the machine is little-endian.  If so, then, for binary
- *      data, it will force the data to be big-endian.
- *
- *  Note:       This assumes that all inputs are 4 bytes long.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void force_big_endian(unsigned char *bytes)
-{
-    static int doneTest = 0;
-    static int shouldSwap = 0;
-    if (!doneTest)
-    {
-        int tmp1 = 1;
-        unsigned char *tmp2 = (unsigned char *) &tmp1;
-        if (*tmp2 != 0)
-            shouldSwap = 1;
-        doneTest = 1;
-    }
-
-    if (shouldSwap & useBinary)
-    {
-        unsigned char tmp = bytes[0];
-        bytes[0] = bytes[3];
-        bytes[3] = tmp;
-        tmp = bytes[1];
-        bytes[1] = bytes[2];
-        bytes[2] = tmp;
-    }
-}
-
-/* ****************************************************************************
- *  Function: force_double_big_endian
- *
- *  Purpose:
- *      Determines if the machine is little-endian.  If so, then, for binary
- *      data, it will force the data to be big-endian.
- *
- *  Note:       This assumes that all inputs are 8 bytes long.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- *
- * ************************************************************************* */
-
-static void force_double_big_endian(unsigned char *bytes)
-{
-    static int doneTest = 0;
-    static int shouldSwap = 0;
-    if (!doneTest)
-    {
-        int tmp1 = 1;
-        unsigned char *tmp2 = (unsigned char *) &tmp1;
-        if (*tmp2 != 0)
-            shouldSwap = 1;
-        doneTest = 1;
-    }
-
-    if (shouldSwap & useBinary)
-    {
-        unsigned char tmp = bytes[0];
-        bytes[0] = bytes[7];
-        bytes[7] = tmp;
-        tmp = bytes[1];
-        bytes[1] = bytes[6];
-        bytes[6] = tmp;
-        tmp = bytes[2];
-        bytes[2] = bytes[5];
-        bytes[5] = tmp;
-        tmp = bytes[3];
-        bytes[3] = bytes[4];
-        bytes[4] = tmp;
-    }
-}
-
-
-/* ****************************************************************************
- *  Function: write_string
- *
- *  Purpose:
- *      Writes a character to the open file.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void write_string(const char *str)
-{
-    fprintf(fp, str);
-}
-
-
-/* ****************************************************************************
- *  Function: new_section
- *
- *  Purpose:
- *      Adds a new line, provided we didn't already just do so and we are
- *      writing an ASCII file.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void new_section(void)
-{
-    if (numInColumn != 0)
-        end_line();
-    numInColumn = 0;
-}
-
-
-/* ****************************************************************************
- *  Function: write_int
- *
- *  Purpose:
- *      Writes an integer to the currently open file.  This routine takes
- *      care of ASCII vs binary issues.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void write_int(int val)
-{
-    if (useBinary)
-    {
-        force_big_endian((unsigned char *) &val);
-        fwrite(&val, sizeof(int), 1, fp);
-    }
-    else
-    {
-        char str[128];
-        sprintf(str, "%d ", val);
-        fprintf(fp, str);
-        if (((numInColumn++) % 9) == 8)
-        {
-            char str2[8] = "\n";
-            fprintf(fp, str2);
-            numInColumn = 0;
-        }
-    }
-}
-
-
-/* ****************************************************************************
- *  Function: write_float
- *
- *  Purpose:
- *      Writes an float to the currently open file.  This routine takes
- *      care of ASCII vs binary issues.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- *  Modifications:
- *  
- *    Hank Childs, Fri Apr 22 09:14:44 PDT 2005
- *    Make precision changes suggested by Jeff McAninch
- *
- * ************************************************************************* */
-
-static void write_float(float val)
-{
-    if (useBinary)
-    {
-        force_big_endian((unsigned char *) &val);
-        fwrite(&val, sizeof(float), 1, fp);
-    }
-    else
-    {
-        char str[128];
-        sprintf(str, "%20.12e ", val);
-        fprintf(fp, str);
-        if (((numInColumn++) % 9) == 8)
-        {
-            end_line();
-        }
-    }
-}
-/* ****************************************************************************
- *  Function: write_double
- *
- *  Purpose:
- *      Writes a double to the currently open file.  This routine takes
- *      care of ASCII vs binary issues.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- *
- *  Modifications:
- *
- *    Hank Childs, Fri Apr 22 09:14:44 PDT 2005
- *    Make precision changes suggested by Jeff McAninch
- *
- * ************************************************************************* */
-
-static void write_double(double val)
-{
-    if (useBinary)
-    {
-        force_double_big_endian((unsigned char *) &val);
-        fwrite(&val, sizeof(double), 1, fp);
-    }
-    else
-    {
-        char str[128];
-        sprintf(str, "%20.12e ", val);
-        fprintf(fp, str);
-        if (((numInColumn++) % 9) == 8)
-        {
-            end_line();
-        }
-    }
-}
-
-
-/* ****************************************************************************
- *  Function: write_header
- *
- *  Purpose:
- *      Writes the standard VTK header to the file.  This should be the first
- *      thing written to the file.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static void write_header(void)
-{
-    fprintf(fp, "# vtk DataFile Version 2.0\n");
-    fprintf(fp, "Written using VisIt writer\n");
-    if (useBinary)
-        fprintf(fp, "BINARY\n");
-    else
-        fprintf(fp, "ASCII\n");
-}
-
-
-/* ****************************************************************************
- *  Function: write_variables
- *
- *  Purpose:
- *      Writes the variables to the file.  This can be a bit tricky.  The
- *      cell data must be written first, followed by the point data.  When
- *      writing the [point|cell] data, one variable must be declared the
- *      primary scalar and another the primary vector (provided scalars
- *      or vectors exist).  The rest of the arrays are added through the
- *      "field data" mechanism.  Field data should support groups of arrays 
- *      with different numbers of components (ie a scalar and a vector), but
- *      there is a failure with the VTK reader.  So the scalars are all written
- *      one group of field data and then the vectors as another.  If you don't
- *      write it this way, the vectors do not show up.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-void write_variables(int nvars, int *vardim, int *centering, 
-                     const char * const * varname, double **vars,
-                     int npts, int ncells)
-{
-    char str[1024];
-    int i, j, first_scalar, first_vector;
-    int num_scalars, num_vectors;
-    int num_field = 0;
-
-    new_section();
-    sprintf(str, "CELL_DATA %d\n", ncells);
-    write_string(str);
-
-    first_scalar = 0;
-    first_vector = 0;
-    num_scalars = 0;
-    num_vectors = 0;
-    /* The field data is where the non-primary scalars and vectors are 
-     * stored.  They must all be grouped together at the end of the point
-     * data.  So write out the primary scalars and vectors first.
-     */
-    for (i = 0 ; i < nvars ; i++)
-    {
-        if (centering[i] == 0)
-        {
-            int num_to_write = 0;
-            int should_write = 0;
-
-            if (vardim[i] == 1)
-            {
-                if (first_scalar == 0)
-                {
-                    should_write = 1;
-                    sprintf(str, "SCALARS %s double\n", varname[i]);
-                    write_string(str);
-                    write_string("LOOKUP_TABLE default\n");
-                    first_scalar = 1;
-                }
-                else 
-                    num_scalars++;
-            }
-            else if (vardim[i] == 3)
-            {
-                if (first_vector == 0)
-                {
-                    should_write = 1;
-                    sprintf(str, "VECTORS %s double\n", varname[i]);
-                    write_string(str);
-                    first_vector = 1;
-                }
-                else 
-                    num_vectors++;
-            }
-            else
-            {
-                printf("Only supported variable dimensions are 1 and 3.\n");
-                printf("Ignoring variable %s.\n", varname[i]);
-                continue;
-            }
-
-            if (should_write)
-            {
-                num_to_write = ncells*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-
-    first_scalar = 0;
-    if (num_scalars > 0)
-    {
-        sprintf(str, "FIELD FieldData %d\n", num_scalars);
-        write_string(str);
-        for (i = 0 ; i < nvars ; i++)
-        {
-            int should_write = 0;
-            if (centering[i] == 0)
-            {
-                if (vardim[i] == 1)
-                {
-                    if (first_scalar == 0)
-                    {
-                        first_scalar = 1;
-                    }
-                    else
-                    {
-                        should_write = 1;
-                        sprintf(str, "%s 1 %d double\n", varname[i], ncells);
-                        write_string(str);
-                    }
-                }
-            }
-
-            if (should_write)
-            {
-                int num_to_write = ncells*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-
-    first_vector = 0;
-    if (num_vectors > 0)
-    {
-        sprintf(str, "FIELD FieldData %d\n", num_vectors);
-        write_string(str);
-        for (i = 0 ; i < nvars ; i++)
-        {
-            int should_write = 0;
-            if (centering[i] == 0)
-            {
-                int num_to_write = 0;
-    
-                if (vardim[i] == 3)
-                {
-                    if (first_vector == 0)
-                    {
-                        first_vector = 1;
-                    }
-                    else
-                    {
-                        should_write = 1;
-                        sprintf(str, "%s 3 %d double\n", varname[i], ncells);
-                        write_string(str);
-                    }
-                }
-            }
-
-            if (should_write)
-            {
-                int num_to_write = ncells*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-
-    new_section();
-    sprintf(str, "POINT_DATA %d\n", npts);
-    write_string(str);
-
-    first_scalar = 0;
-    first_vector = 0;
-    num_scalars = 0;
-    num_vectors = 0;
-    /* The field data is where the non-primary scalars and vectors are 
-     * stored.  They must all be grouped together at the end of the point
-     * data.  So write out the primary scalars and vectors first.
-     */
-    for (i = 0 ; i < nvars ; i++)
-    {
-        if (centering[i] != 0)
-        {
-            int num_to_write = 0;
-            int should_write = 0;
-
-            if (vardim[i] == 1)
-            {
-                if (first_scalar == 0)
-                {
-                    should_write = 1;
-                    sprintf(str, "SCALARS %s double\n", varname[i]);
-                    write_string(str);
-                    write_string("LOOKUP_TABLE default\n");
-                    first_scalar = 1;
-                }
-                else 
-                    num_scalars++;
-            }
-            else if (vardim[i] == 3)
-            {
-                if (first_vector == 0)
-                {
-                    should_write = 1;
-                    sprintf(str, "VECTORS %s double\n", varname[i]);
-                    write_string(str);
-                    first_vector = 1;
-                }
-                else 
-                    num_vectors++;
-            }
-            else
-            {
-                printf("Only supported variable dimensions are 1 and 3.\n");
-                printf("Ignoring variable %s.\n", varname[i]);
-                continue;
-            }
-
-            if (should_write)
-            {
-                num_to_write = npts*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-
-    first_scalar = 0;
-    if (num_scalars > 0)
-    {
-        sprintf(str, "FIELD FieldData %d\n", num_scalars);
-        write_string(str);
-        for (i = 0 ; i < nvars ; i++)
-        {
-            int should_write = 0;
-            if (centering[i] != 0)
-            {
-                if (vardim[i] == 1)
-                {
-                    if (first_scalar == 0)
-                    {
-                        first_scalar = 1;
-                    }
-                    else
-                    {
-                        should_write = 1;
-                        sprintf(str, "%s 1 %d double\n", varname[i], npts);
-                        write_string(str);
-                    }
-                }
-            }
-
-            if (should_write)
-            {
-                int num_to_write = npts*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-
-    first_vector = 0;
-    if (num_vectors > 0)
-    {
-        sprintf(str, "FIELD FieldData %d\n", num_vectors);
-        write_string(str);
-        for (i = 0 ; i < nvars ; i++)
-        {
-            int should_write = 0;
-            if (centering[i] != 0)
-            {
-                int num_to_write = 0;
-    
-                if (vardim[i] == 3)
-                {
-                    if (first_vector == 0)
-                    {
-                        first_vector = 1;
-                    }
-                    else
-                    {
-                        should_write = 1;
-                        sprintf(str, "%s 3 %d double\n", varname[i], npts);
-                        write_string(str);
-                    }
-                }
-            }
-
-            if (should_write)
-            {
-                int num_to_write = npts*vardim[i];
-                for (j = 0 ; j < num_to_write ; j++)
-                {
-                    write_double(vars[i][j]);
-                }
-                end_line();
-            }
-        }
-    }
-}
-
-
-/* ****************************************************************************
-//  Function: write_point_mesh
-//
-//  Purpose:
-//      Writes out a point mesh.
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      npts       The number of points in the mesh.
-//      pts        The spatial locations of the points.  This array should
-//                 be size 3*npts.  The points should be encoded as:
-//                 <x1, y1, z1, x2, y2, z2, ..., xn, yn, zn>
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_point_mesh(const char *filename, int ub, int npts, float *pts,
-                      int nvars, int *vardim, const char * const *varnames,
-                      double **vars)
-{
-    int   i;
-    char  str[128];
-    int  *centering = NULL;
-
-    useBinary = ub;
-    open_file(filename);
-    write_header();
-
-    write_string("DATASET UNSTRUCTURED_GRID\n");
-    sprintf(str, "POINTS %d float\n", npts);
-    write_string(str);
-    for (i = 0 ; i < 3*npts ; i++)
-    {
-        write_float(pts[i]);
-    }
-
-    new_section();
-    sprintf(str, "CELLS %d %d\n", npts, 2*npts);
-    write_string(str);
-    for (i = 0 ; i < npts ; i++)
-    {
-        write_int(1);
-        write_int(i);
-        end_line();
-    }
-
-    new_section();
-    sprintf(str, "CELL_TYPES %d\n", npts);
-    write_string(str);
-    for (i = 0 ; i < npts ; i++)
-    {
-        write_int(VISIT_VERTEX);
-        end_line();
-    }
-
-    centering = (int *) malloc(nvars*sizeof(int));
-    for (i = 0 ; i < nvars ; i++)
-        centering[i] = 1;
-    write_variables(nvars, vardim, centering, varnames, vars, npts, npts);
-    free(centering);
-
-    close_file();
-}
-
-
-/* ****************************************************************************
- *  Function: num_points_for_cell
- *
- *  Purpose:
- *      Determines the number of points for the type of cell.
- *
- *  Programmer: Hank Childs
- *  Creation:   September 3, 2004
- * 
- * ************************************************************************* */
-
-static int num_points_for_cell(int celltype)
-{
-    int npts = 0;
-    switch (celltype)
-    {
-       case VISIT_VERTEX:
-         npts = 1;
-         break;
-       case VISIT_LINE:
-         npts = 2;
-         break;
-       case VISIT_TRIANGLE:
-         npts = 3;
-         break;
-       case VISIT_QUAD:
-         npts = 4;
-         break;
-       case VISIT_TETRA:
-         npts = 4;
-         break;
-       case VISIT_HEXAHEDRON:
-         npts = 8;
-         break;
-       case VISIT_WEDGE:
-         npts = 6;
-         break;
-       case VISIT_PYRAMID:
-         npts = 5;
-         break;
-    }
-    return npts;
-}
-
-
-/* ****************************************************************************
-//  Function: write_unstructured_mesh
-//
-//  Purpose:
-//      Writes out a unstructured mesh.
-//
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      npts       The number of points in the mesh.
-//      pts        The spatial locations of the points.  This array should
-//                 be size 3*npts.  The points should be encoded as:
-//                 <x1, y1, z1, x2, y2, z2, ..., xn, yn, zn>
-//      ncells     The number of cells.
-//      celltypes  The type of each cell.
-//      conn       The connectivity array.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_unstructured_mesh(const char *filename, int ub, int npts,
-                             float *pts, int ncells, int *celltypes, int *conn,
-                             int nvars, int *vardim, int *centering,
-                             const char * const *varnames, double **vars)
-{
-    int   i, j;
-    char  str[128];
-    int   conn_size = 0;
-    int  *curr_conn = conn;
-
-    useBinary = ub;
-    open_file(filename);
-    write_header();
-
-    write_string("DATASET UNSTRUCTURED_GRID\n");
-    sprintf(str, "POINTS %d float\n", npts);
-    write_string(str);
-    for (i = 0 ; i < 3*npts ; i++)
-    {
-        write_float(pts[i]);
-    }
-
-    new_section();
-    for (i = 0 ; i < ncells ; i++)
-    {
-        int npts = num_points_for_cell(celltypes[i]);
-         
-        conn_size += npts+1;
-    }
-    sprintf(str, "CELLS %d %d\n", ncells, conn_size);
-    write_string(str);
-    for (i = 0 ; i < ncells ; i++)
-    {
-        int npts = num_points_for_cell(celltypes[i]);
-        write_int(npts);
-        for (j = 0 ; j < npts ; j++)
-            write_int(*curr_conn++);
-        end_line();
-    }
-
-    new_section();
-    sprintf(str, "CELL_TYPES %d\n", ncells);
-    write_string(str);
-    for (i = 0 ; i < ncells ; i++)
-    {
-        write_int(celltypes[i]);
-        end_line();
-    }
-
-    write_variables(nvars, vardim, centering, varnames, vars, npts, ncells);
-
-    close_file();
-}
-
-
-/* ****************************************************************************
-//  Function: write_rectilinear_mesh
-//
-//  Purpose:
-//      Writes out a rectilinear mesh.
-//
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nX, nY, nZ }, where nX is the
-//                 number of points in the X-dimension, etc.
-//      x          An array of size dims[0] that contains the x-coordinates.
-//      y          An array of size dims[1] that contains the x-coordinates.
-//      z          An array of size dims[2] that contains the x-coordinates.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-//  Modifications:
-//
-//    Hank Childs, Wed Apr  6 16:22:57 PDT 2005
-//    Fix problem with 2D structured meshes and assessing cell count.
-//
-// ***************************************************************************/
-
-void write_rectilinear_mesh(const char *filename, int ub, int *dims,
-                            float *x, float *y, float *z,
-                            int nvars, int *vardim, int *centering,
-                            const char * const *varnames, double **vars)
-{
-    int   i, j;
-    char  str[128];
-    int npts = dims[0]*dims[1]*dims[2];
-    int ncX = (dims[0] - 1 < 1 ? 1 : dims[0] - 1);
-    int ncY = (dims[1] - 1 < 1 ? 1 : dims[1] - 1);
-    int ncZ = (dims[2] - 1 < 1 ? 1 : dims[2] - 1);
-    int ncells = ncX*ncY*ncZ;
-
-    useBinary = ub;
-    open_file(filename);
-    write_header();
-
-    write_string("DATASET RECTILINEAR_GRID\n");
-    sprintf(str, "DIMENSIONS %d %d %d\n", dims[0], dims[1], dims[2]);
-    write_string(str);
-    sprintf(str, "X_COORDINATES %d float\n", dims[0]);
-    write_string(str);
-    for (i = 0 ; i < dims[0] ; i++)
-        write_float(x[i]);
-    new_section();
-    sprintf(str, "Y_COORDINATES %d float\n", dims[1]);
-    write_string(str);
-    for (i = 0 ; i < dims[1] ; i++)
-        write_float(y[i]);
-    new_section();
-    sprintf(str, "Z_COORDINATES %d float\n", dims[2]);
-    write_string(str);
-    for (i = 0 ; i < dims[2] ; i++)
-        write_float(z[i]);
-
-    write_variables(nvars, vardim, centering, varnames, vars, npts, ncells);
-
-    close_file();
-}
-
-
-/* ****************************************************************************
-//  Function: write_regular_mesh
-//
-//  Purpose:
-//      Writes out a regular mesh.  A regular mesh is one where the data lies
-//      along regular intervals.  "Brick of bytes/doubles",
-//      "Block of bytes/double", and MRI data all are examples of data that
-//      lie on regular meshes.
-//
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nX, nY, nZ }, where nX is the
-//                 number of points in the X-dimension, etc.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_regular_mesh(const char *filename, int ub, int *dims,
-                        int nvars, int *vardim, int *centering,
-                        const char * const *varnames, double **vars)
-{
-    int  i;
-
-    float *x = (float *) malloc(sizeof(float)*dims[0]);
-    float *y = (float *) malloc(sizeof(float)*dims[1]);
-    float *z = (float *) malloc(sizeof(float)*dims[2]);
-
-    for (i = 0 ; i < dims[0] ; i++)
-        x[i] = (float) i;
-    for (i = 0 ; i < dims[1] ; i++)
-        y[i] = (float) i;
-    for (i = 0 ; i < dims[2] ; i++)
-        z[i] = (float) i;
-
-    write_rectilinear_mesh(filename, ub, dims, x, y, z, nvars, vardim,
-                           centering, varnames, vars);
-
-    free(x);
-    free(y);
-    free(z);
-}
-
-
-/* ****************************************************************************
-//  Function: write_curvilinear_mesh
-//
-//  Purpose:
-//      Writes out a curvilinear mesh.
-//
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nI, nJ, nK }, where nI is the
-//                 number of points in the logical I dimension, etc.
-//      pts        An array of size nI*nJ*nK*3.  The array should be layed
-//                 out as (pt(i=0,j=0,k=0), pt(i=1,j=0,k=0), ...
-//                 pt(i=nI-1,j=0,k=0), pt(i=0,j=1,k=0), ...).
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-//  Modifications:
-//
-//    Hank Childs, Wed Apr  6 16:22:57 PDT 2005
-//    Fix problem with 2D structured meshes and assessing cell count.
-//
-// ***************************************************************************/
-
-void write_curvilinear_mesh(const char *filename, int ub, int *dims,float *pts,
-                            int nvars, int *vardim, int *centering,
-                            const char * const *varnames, double **vars)
-{
-    int   i, j;
-    char  str[128];
-    int npts = dims[0]*dims[1]*dims[2];
-    int ncX = (dims[0] - 1 < 1 ? 1 : dims[0] - 1);
-    int ncY = (dims[1] - 1 < 1 ? 1 : dims[1] - 1);
-    int ncZ = (dims[2] - 1 < 1 ? 1 : dims[2] - 1);
-    int ncells = ncX*ncY*ncZ;
-
-    useBinary = ub;
-    open_file(filename);
-    write_header();
-
-    write_string("DATASET STRUCTURED_GRID\n");
-    sprintf(str, "DIMENSIONS %d %d %d\n", dims[0], dims[1], dims[2]);
-    write_string(str);
-    sprintf(str, "POINTS %d float\n", npts);
-    write_string(str);
-    for (i = 0 ; i < 3*npts ; i++)
-    {
-        write_float(pts[i]);
-    }
-
-    write_variables(nvars, vardim, centering, varnames, vars, npts, ncells);
-
-    close_file();
-}
-
-
diff --git a/src/TNL/legacy/incompressible-navier-stokes/visit_writer.h b/src/TNL/legacy/incompressible-navier-stokes/visit_writer.h
deleted file mode 100644
index ae67e3ccc8de624ccf7f027cddf223cc32cd476c..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/incompressible-navier-stokes/visit_writer.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* ************************************************************************* //
-//                              visit_writer.h                               //
-// ************************************************************************* */
-
-/* 
-// This file contains function prototypes for writing out point meshes, 
-// unstructured meshes, rectilinear meshes, regular meshes, and 
-// structured/curvilinear meshes into files that can later be read by VisIt.
-//
-// Each routine assumes that the data being written is three-dimensional.
-// If the data is two-dimensional, you must still write out the data
-// as three-dimensional (ie pad arrays so that they are the correct size, etc).
-// However: the VisIt reader will determine that the data is truly two-
-// dimensional and visualize it as a two-dimensional dataset.
-//
-// All writers have an ASCII vs Binary decision.  The tradeoffs are the
-// standard ones: ASCII is human readable, but slow.  The
-// binary is much faster, but not human readable.  Note: the binary format
-// is portable, since it converts all data to be big-endian (this was a 
-// design decision for the format the visit_writer writes to -- the VTK
-// format).
-//
-// If you have multiple grids, you can write out one file for each grid.
-// There are potential pitfalls in doing this, where extra geometry and
-// interpolation problems appear along grid boundaries.  For additional
-// help with this issue, e-mail visit-help@llnl.gov
-*/
-
-
-/* ****************************************************************************
-//  Function: write_point_mesh
-//
-//  Purpose:
-//      Writes out a point mesh.
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      npts       The number of points in the mesh.
-//      pts        The spatial locations of the points.  This array should
-//                 be size 3*npts.  The points should be encoded as:
-//                 <x1, y1, z1, x2, y2, z2, ..., xn, yn, zn>
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//      
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_point_mesh(const char *filename, int useBinary, int npts, 
-                      float *pts, int nvars, int *vardim, 
-                      const char * const *varnames, double **vars);
-
-
-
-/* ****************************************************************************
-//  Function: write_unstructured_mesh
-//
-//  Purpose:
-//      Writes out a unstructured mesh.
-//     
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      npts       The number of points in the mesh.
-//      pts        The spatial locations of the points.  This array should
-//                 be size 3*npts.  The points should be encoded as:
-//                 <x1, y1, z1, x2, y2, z2, ..., xn, yn, zn>
-//      ncells     The number of cells.
-//      celltypes  The type of each cell.
-//      conn       The connectivity array.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering 
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//      
-//  Example:
-//      You have two triangles.  The first has points (0,0,0), (0,1,0), and
-//      (1,1,0).  The second has points (0,0,0), (1,1,0), and (1,0,0).
-//
-//      There are four unique points. 
-//
-//      float pts[12] = { 0,0,0, 0,1,0, 1,1,0, 1,0,0 };
-//
-//      It is important the points list contain only unique points,
-//      because VisIt is not able to correctly determine the connectivity of a 
-//      dataset when points are duplicated.
-//
-//      There are two triangles.
-//      int ncells = 2;
-//
-//      The cells are both triangles.
-//      int celltypes[2] = { VISIT_TRIANGLE, VISIT_TRIANGLE };
-//
-//      The connectivity contains indices into the points list.  The indexing
-//      assumes that each point has size 3 (x,y,z).
-//
-//      int conn[6] = { 0, 1, 2, 0, 2, 3 };
-//
-//  Hint:  
-//      When writing an unstructured mesh, it is easy to get the orientation
-//      of a cell backwards.  VisIt typically does okay with this, but it
-//      can cause problems.  To test if this is happening, bring up VisIt on
-//      your newly outputted dataset and make a Pseudocolor plot of 
-//      "mesh_quality/volume" for 3D datasets or "mesh_quality/area" for 2D 
-//      datasets.  If the cells are inside-out, the volumes or areas will be
-//      negative.
-//      
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-#define VISIT_VERTEX         1
-#define VISIT_LINE           3
-#define VISIT_TRIANGLE       5
-#define VISIT_QUAD           9
-#define VISIT_TETRA         10
-#define VISIT_HEXAHEDRON    12
-#define VISIT_WEDGE         13
-#define VISIT_PYRAMID       14
-
-void write_unstructured_mesh(const char *filename, int useBinary, int npts,
-                             float *pts, int ncells, int *celltypes, int *conn, 
-                             int nvars, int *vardim, int *centering,
-                             const char * const *varnames, double **vars);
-
-
-
-/* ****************************************************************************
-//  Function: write_regular_mesh
-//
-//  Purpose:
-//      Writes out a regular mesh.  A regular mesh is one where the data lies
-//      along regular intervals.  "Brick of bytes/floats", 
-//      "Block of bytes/floats", and MRI data all are examples of data that
-//      lie on regular meshes.
-//     
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nX, nY, nZ }, where nX is the
-//                 number of points in the X-dimension, etc.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering 
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//      
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_regular_mesh(const char *filename, int useBinary, int *dims, 
-                        int nvars, int *vardim, int *centering,
-                        const char * const *varnames, double **vars);
-
-
-
-
-/* ****************************************************************************
-//  Function: write_rectilinear_mesh
-//
-//  Purpose:
-//      Writes out a rectilinear mesh.
-//     
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nX, nY, nZ }, where nX is the
-//                 number of points in the X-dimension, etc.
-//      x          An array of size dims[0] that contains the x-coordinates.
-//      y          An array of size dims[1] that contains the x-coordinates.
-//      z          An array of size dims[2] that contains the x-coordinates.
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering 
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//      
-//
-//  Example:
-//      You have a rectilinear mesh with x = { 0, 1, 2}, y = { 1, 1.5, 2, 3 },
-//      and z = { 2.5, 3.5 }.
-//
-//      Then dims = { 3, 4, 2 }.
-//      
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_rectilinear_mesh(const char *filename, int useBinary, 
-                            int *dims, float *x, float *y, float *z, 
-                            int nvars, int *vardim, int *centering, 
-                            const char * const *varnames, double **vars);
-
-
-
-
-/* ****************************************************************************
-//  Function: write_curvilinear_mesh
-//
-//  Purpose:
-//      Writes out a curvilinear mesh.
-//     
-//
-//  Arguments:
-//      filename   The name of the file to write.  If the extension ".vtk" is
-//                 not present, it will be added.
-//      useBinary  '0' to write ASCII, !0 to write binary
-//      dims       An array of size 3 = { nI, nJ, nK }, where nI is the
-//                 number of points in the logical I dimension, etc.
-//      pts        An array of size nI*nJ*nK*3.  The array should be layed
-//                 out as (pt(i=0,j=0,k=0), pt(i=1,j=0,k=0), ...
-//                 pt(i=nI-1,j=0,k=0), pt(i=0,j=1,k=0), ...).
-//      nvars      The number of variables.
-//      vardim     The dimension of each variable.  The size of vardim should
-//                 be nvars.  If var i is a scalar, then vardim[i] = 1.
-//                 If var i is a vector, then vardim[i] = 3.
-//      centering  The centering of each variable.  The size of centering 
-//                 should be nvars.  If centering[i] == 0, then the variable
-//                 is cell-based.  If centering[i] != 0, then the variable
-//                 is point-based.
-//      vars       An array of variables.  The size of vars should be nvars.
-//                 The size of vars[i] should be npts*vardim[i].
-//      
-//
-//  Programmer: Hank Childs
-//  Creation:   September 2, 2004
-//
-// ***************************************************************************/
-
-void write_curvilinear_mesh(const char *filename, int useBinary, 
-                            int *dims, float *pts,
-                            int nvars, int *vardim, int *centering, 
-                            const char * const *varnames, double **vars);
-
-
-
diff --git a/src/TNL/legacy/mesh/tnlDistributedGrid.h b/src/TNL/legacy/mesh/tnlDistributedGrid.h
deleted file mode 100644
index 6cdc286e9535c54e19a47e92cae58e286d6feaca..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/mesh/tnlDistributedGrid.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/***************************************************************************
-                          tnlDistributedGrid.h  -  description
-                             -------------------
-    begin                : Feb 26, 2011
-    copyright            : (C) 2011 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLDISTRIBUTEDGRID_H_
-#define TNLDISTRIBUTEDGRID_H_
-
-#include <TNL/Object.h>
-#include <TNL/tnlCommunicator.h>
-
-template< int Dimension,
-          typename GridType,
-          typename Device = Devices::Host,
-          typename Real = double,
-          typename Index = int >
-class tnlDistributedGrid : public Object
-{
-   //! We do not allow constructor without parameters.
-   tnlDistributedGrid();
-
-   //! We do not allow copy constructor without object name.
-   tnlDistributedGrid( const tnlDistributedGrid< Dimension, Real, Device, Index >& a );
-
-   public:
-
-   tnlDistributedGrid( const String& name );
-
-   bool init( tnlCommunicator* communicator,
-              const GridType& grid,
-              const StaticVector< Dimension, Index >& subdomainOverlaps );
-
-   tnlCommunicator< Device >* getCommunicator() const;
-
-   const StaticVector< Dimension, Real >& getDomainLowerCorner() const;
-
-   const StaticVector< Dimension, Real >& getDomainUpperCorner() const;
-
-   const StaticVector< Dimension, Index >& getDimensions() const;
-
-   const StaticVector< Dimension, int >& getGridDimensions() const;
-
-   const StaticVector< Dimension, int >& getLowerNeighbors() const;
-
-   const StaticVector< Dimension, Index >& getLowerSubdomainsOverlaps() const;
-
-   const StaticVector< Dimension, int >& getNodeCoordinates() const;
-
-   const StaticVector< Dimension, Index >& getSubdomainDimensions() const;
-
-   const StaticVector< Dimension, Index >& getUpperSubdomainsOverlaps() const;
-
-   const StaticVector< Dimension, int >& getUppperNeighbors() const;
-
-   protected:
-
-   //! Pointer to the communicator used by this distributed grid.
-   tnlCommunicator< Device >* communicator;
-
-   //! In 2D this is the left bottom corner of the global domain.
-   /*!***
-    * This is naturally generalized to more dimensions.
-    */
-   StaticVector< Dimension, Real > domainLowerCorner;
-
-   //! In 2D this is the right top corner of the global domain.
-   /*!***
-    * This is naturally generalized to more dimensions.
-    */
-   StaticVector< Dimension, Real > domainUpperCorner;
-
-   //! Dimension of the global domain.
-   StaticVector< Dimension, Index > globalDimensions;
-
-   //! Dimension of the local subdomain.
-   StaticVector< Dimension, Index > subdomainDimensions;
-
-   //! Number of the distributed grid nodes along each dimension.
-   StaticVector< Dimension, int > gridDimensions;
-
-   //! Coordinates of this node of the distributed grid.
-   StaticVector< Dimension, int > nodeCoordinates;
-
-   //! Here are device IDs taken from the tnlCommunicator.
-   /*!***
-    * In 2D, this is the device ID of the neighbor on the
-    * right and above.
-    */
-   StaticVector< Dimension, int > uppperNeighbors;
-
-   //! Here are device IDs taken from the tnlCommunicator.
-   /*!***
-    * In 2D, this is the device ID of the neighbor on the
-    * left and below.
-    */
-   StaticVector< Dimension, int > lowerNeighbors;
-
-   //! Here are widths of overlaps at subdomain boundaries with neighbors.
-   /*!***
-    * These overlaps are necessary for exchange of data
-    * between neighboring nodes. In 2D, here are overlaps
-    * with the neighbors on the right and above.
-    */
-   StaticVector< Dimension, Index > upperSubdomainsOverlaps;
-
-   //! Here are widths of overlaps at subdomain boundaries with neighbors.
-   /*!***
-    * These overlaps are necessary for exchange of data
-    * between neighboring nodes. In 2D, here are overlaps
-    * with the neighbors on the left and below.
-    */
-   StaticVector< Dimension, Index > lowerSubdomainsOverlaps;
-
-};
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: tnlDistributedGrid( const String& name )
- : Object( name )
-{
-
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-bool tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: init( tnlCommunicator* communicator,
-                                                                              const GridType& grid,
-                                                                              const StaticVector< Dimension, int >& gridDimensions,
-                                                                              const StaticVector< Dimension, Index >& subdomainOverlaps )
-{
-
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-tnlCommunicator* tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getCommunicator() const
-{
-    return communicator;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Real >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getDomainLowerCorner() const
-{
-    return domainLowerCorner;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Real >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getDomainUpperCorner() const
-{
-    return domainUpperCorner;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Index >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getDimensions() const
-{
-    return globalDimensions;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, int >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getGridDimensions() const
-{
-    return gridDimensions;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, int >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getLowerNeighbors() const
-{
-    return lowerNeighbors;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Index >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getLowerSubdomainsOverlaps() const
-{
-    return lowerSubdomainsOverlaps;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, int >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getNodeCoordinates() const
-{
-    return nodeCoordinates;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Index >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getSubdomainDimensions() const
-{
-    return subdomainDimensions;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, Index >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getUpperSubdomainsOverlaps() const
-{
-    return upperSubdomainsOverlaps;
-}
-
-template< int Dimension, typename GridType, typename Device, typename Real, typename Index >
-const StaticVector< Dimension, int >& tnlDistributedGrid< Dimension, GridType, Device, Real, Index > :: getUppperNeighbors() const
-{
-    return uppperNeighbors;
-}
-
-#endif /* TNLDISTRIBUTEDGRID_H_ */
diff --git a/src/TNL/legacy/tnl-benchmarks.cpp b/src/TNL/legacy/tnl-benchmarks.cpp
deleted file mode 100644
index d62fd60f1e858a232b8d61fdf4b441856172ee21..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/tnl-benchmarks.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/***************************************************************************
-                          tnl-benchmarks.cpp  -  description
-                             -------------------
-    begin                : Nov 25, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#include <TNL/TimerRT.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/VectorCUDA.h>
-#include <TNL/tnl-cuda-kernels.cu.h>
-#include <TNL/tnl-benchmarks.h>
-
-
-int main( int argc, char* argv[] )
-{
-   std::cout << "Benchmarking memory bandwidth when transfering int ..." << std::endl;
-
-   const int size = 1 << 22;
-   double host_to_host_band_width;
-   double host_to_device_band_width;
-   double device_to_host_band_width;
-   double device_to_device_band_width;
-
-   transferBenchmark< int >( size,
-                             host_to_host_band_width,
-                             host_to_device_band_width,
-                             device_to_host_band_width,
-                             device_to_device_band_width );
-
-
-   std::cout << "Benchmarking reduction of int ..." << std::endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< int >( size, i );
-
-   std::cout << "Benchmarking reduction of float ..." << std::endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< float >( size, i );
-
-   std::cout << "Benchmarking reduction of double ..." << std::endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< double >( size / 2, i );
-
-   return EXIT_SUCCESS;
-}
diff --git a/src/TNL/legacy/tnl-benchmarks.h b/src/TNL/legacy/tnl-benchmarks.h
deleted file mode 100644
index df9af1cf43231d36e5460cd410b3c7756529ada9..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/tnl-benchmarks.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/***************************************************************************
-                          tnl-benchmarks.h  -  description
-                             -------------------
-    begin                : Jan 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#ifndef TNLBENCHMARKS_H_
-#define TNLBENCHMARKS_H_
-
-#include <TNL/Math.h>
-
-template< class T >
-bool transferBenchmark( const int size,
-                        double& host_to_host_band_width,
-                        double& host_to_device_band_width,
-                        double& device_to_host_band_width,
-                        double& device_to_device_band_width )
-{
-
-  Vector< T > host_vector( "transferBenchmark:host-vector", size );
-  Vector< T > host_vector2( "transferBenchmark:host-vector-2", size );
-  VectorCUDA< T > device_vector( "transferBenchmark:device-vector", size );
-  VectorCUDA< T > device_vector2( "transferBenchmark:device-vector-2", size );
-
-   for( int i = 0; i < size; i ++ )
-      host_vector[ i ] = i + 1;
-
-   const long int cycles = 100;
-   long int bytes = cycles * size * sizeof( int );
-   long int mega_byte = 1 << 20;
-
-   TimerRT timer;
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      if( ! host_vector2. copyFrom( host_vector ) )
-         return false;
-   double time = timer. getTime();
-   double giga_byte = ( double ) ( 1 << 30 );
-   host_to_host_band_width = bytes / giga_byte / time;
-
-  std::cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << std::endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      if( ! device_vector. copyFrom( host_vector ) )
-         return false;
-   time = timer. getTime();
-   host_to_device_band_width = bytes / giga_byte / time;
-
-  std::cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << std::endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      if( ! host_vector2. copyFrom( device_vector ) )
-         return false;
-   time = timer. getTime();
-   device_to_host_band_width = bytes / giga_byte / time;
-
-  std::cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << std::endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      if( ! device_vector2. copyFrom( device_vector ) )
-         return false;
-
-   time = timer. getTime();
-
-   // Since we read and write tha data back we process twice as many bytes.
-   bytes *= 2;
-   device_to_device_band_width = bytes / giga_byte / time;
-
-  std::cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << std::endl;
-}
-
-template< class T >
-void tnlCPUReductionSum( const Vector< T >& host_vector,
-                         T& sum )
-{
-   const T* data = host_vector. Data();
-   const int size = host_vector. GetSize();
-   sum = 0.0;
-   for( int i = 0; i < size; i ++ )
-      sum += data[ i ];
-};
-
-template< class T >
-void tnlCPUReductionMin( const Vector< T >& host_vector,
-                         T& min )
-{
-   const T* data = host_vector. Data();
-   const int size = host_vector. GetSize();
-   //TNL_ASSERT( data );
-   min = data[ 0 ];
-   for( int i = 1; i < size; i ++ )
-      min = :: min( min,  data[ i ] );
-};
-
-template< class T >
-void tnlCPUReductionMax( const Vector< T >& host_vector,
-                         T& max )
-{
-   const T* data = host_vector. Data();
-   const int size = host_vector. GetSize();
-   //TNL_ASSERT( data );
-   max = data[ 0 ];
-   for( int i = 1; i < size; i ++ )
-      max = :: max( max,  data[ i ] );
-};
-
-template< class T >
-void reductionBenchmark( const int size,
-                         const int algorithm )
-{
-   Vector< T > host_vector( "reductionBenchmark:host-vector", size );
-   VectorCUDA< T > device_vector( "reductionBenchmark:device-vector", size );
-   VectorCUDA< T > device_aux( "reductionBenchmark:device-aux", size / 2 );
-
-   for( int i = 0; i < size; i ++ )
-      host_vector[ i ] = i + 1;
-
-   device_vector. copyFrom( host_vector );
-
-   T sum, min, max;
-   const long int reducing_cycles( 10 );
-
-   TimerRT timer;
-   timer. Reset();
-   for( int i = 0; i < reducing_cycles; i ++ )
-   {
-      switch( algorithm )
-      {
-         case 0:  // reduction on CPU
-            tnlCPUReductionSum( host_vector, sum );
-            tnlCPUReductionMin( host_vector, sum );
-            tnlCPUReductionMax( host_vector, sum );
-         case 1:
-            Devices::CudaSimpleReduction1Sum( size,
-                                        device_vector. Data(),
-                                        sum,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction1Min( size,
-                                        device_vector. Data(),
-                                        min,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction1Max( size,
-                                        device_vector. Data(),
-                                        max,
-                                        device_aux. Data() );
-            break;
-         case 2:
-            Devices::CudaSimpleReduction2Sum( size,
-                                        device_vector. Data(),
-                                        sum,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction2Min( size,
-                                        device_vector. Data(),
-                                        min,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction2Max( size,
-                                        device_vector. Data(),
-                                        max,
-                                        device_aux. Data() );
-            break;
-         case 3:
-            Devices::CudaSimpleReduction3Sum( size,
-                                        device_vector. Data(),
-                                        sum,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction3Min( size,
-                                        device_vector. Data(),
-                                        min,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction3Max( size,
-                                        device_vector. Data(),
-                                        max,
-                                        device_aux. Data() );
-            break;
-         case 4:
-            Devices::CudaSimpleReduction4Sum( size,
-                                        device_vector. Data(),
-                                        sum,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction4Min( size,
-                                        device_vector. Data(),
-                                        min,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction4Max( size,
-                                        device_vector. Data(),
-                                        max,
-                                        device_aux. Data() );
-            break;
-         case 5:
-            Devices::CudaSimpleReduction5Sum( size,
-                                        device_vector. Data(),
-                                        sum,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction5Min( size,
-                                        device_vector. Data(),
-                                        min,
-                                        device_aux. Data() );
-            Devices::CudaSimpleReduction5Max( size,
-                                        device_vector. Data(),
-                                        max,
-                                        device_aux. Data() );
-            break;
-         default:
-            CudaReductionSum( size,
-                                 device_vector. Data(),
-                                 sum,
-                                 device_aux. Data() );
-            CudaReductionMin( size,
-                                 device_vector. Data(),
-                                 min,
-                                 device_aux. Data() );
-            CudaReductionMax( size,
-                                 device_vector. Data(),
-                                 max,
-                                 device_aux. Data() );
-
-      }
-   }
-   const double time = timer. getTime();
-   double giga_byte = ( double ) ( 1 << 30 );
-   long int mega_byte = 1 << 20;
-   long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
-   const double reduction_band_width = bytes_reduced / giga_byte / time;
-
-  std::cout << "Reducing " << bytes_reduced / mega_byte
-        << " MB on DEVICE using algorithm " << algorithm
-        << " took " << time
-        << " seconds. Bandwidth is " << reduction_band_width
-        << " GB/s." << std::endl;
-}
-
-#endif /* TNLBENCHMARKS_H_ */
diff --git a/src/TNL/legacy/tnlMatrix_impl.h b/src/TNL/legacy/tnlMatrix_impl.h
deleted file mode 100644
index 558f18ab16fba204b2bd9f118617bdae74ce60e9..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/tnlMatrix_impl.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/***************************************************************************
-                          tnlMatrix_impl.h  -  description
-                             -------------------
-    begin                : Dec 18, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLMATRIX_IMPL_H_
-#define TNLMATRIX_IMPL_H_
-
-#include <matrices/tnlMatrix.h>
-#include <core/TNL_ASSERT.h>
-
-template< typename Real,
-          typename Device,
-          typename Index >
-tnlMatrix< Real, Device, Index >::tnlMatrix()
-: rows( 0 ),
-  columns( 0 ),
-  numberOfColors( 0 )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
- bool tnlMatrix< Real, Device, Index >::setDimensions( const IndexType rows,
-                                                       const IndexType columns )
-{
-   TNL_ASSERT( rows > 0 && columns > 0,
-           std::cerr << " rows = " << rows << " columns = " << columns );
-   this->rows = rows;
-   this->columns = columns;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlMatrix< Real, Device, Index >::getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths ) const
-{
-   rowLengths.setSize( this->getRows() );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      rowLengths.setElement( row, this->getRowLength( row ) );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool tnlMatrix< Real, Device, Index >::setLike( const tnlMatrix< Real2, Device2, Index2 >& matrix )
-{
-   return setDimensions( matrix.getRows(), matrix.getColumns() );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Index tnlMatrix< Real, Device, Index >::getRows() const
-{
-   return this->rows;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Index tnlMatrix< Real, Device, Index >::getColumns() const
-{
-   return this->columns;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
-Index tnlMatrix< Real, Device, Index >::getNumberOfColors() const
-{
-    return this->numberOfColors;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlMatrix< Real, Device, Index >::reset()
-{
-   this->rows = 0;
-   this->columns = 0;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool tnlMatrix< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                 const CompressedRowLengthsVector& rowLengths )
-{
-   /*tnlStaticAssert( DeviceType::DeviceType == Devices::HostDevice, );
-   tnlStaticAssert( DeviceType::DeviceType == Matrix:DeviceType::DeviceType, );*/
-
-   this->setLike( matrix );
-   if( ! this->setCompressedRowLengths( rowLengths ) )
-      return false;
-   Containers::Vector< RealType, Devices::Host, IndexType > values;
-   Containers::Vector< IndexType, Devices::Host, IndexType > columns;
-   if( ! values.setSize( this->getColumns() ) ||
-       ! columns.setSize( this->getColumns() ) )
-      return false;
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      matrix.getRow( row, columns.getData(), values.getData() );
-      this->setRow( row, columns.getData(), values.getData(), rowLengths.getElement( row ) );
-   }
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-tnlMatrix< Real, Device, Index >& tnlMatrix< Real, Device, Index >::operator = ( const tnlMatrix< RealType, DeviceType, IndexType >& m )
-{
-   this->setLike( m );
-
-   Containers::Vector< IndexType, DeviceType, IndexType > rowLengths;
-   m.getRowLengths( rowLengths );
-   this->setCompressedRowLengths( rowLengths );
-
-   Containers::Vector< RealType, DeviceType, IndexType > rowValues;
-   Containers::Vector< IndexType, DeviceType, IndexType > rowColumns;
-   const IndexType maxRowLength = rowLengths.max();
-   rowValues.setSize( maxRowLength );
-   rowColumns.setSize( maxRowLength );
-   for( IndexType row = 0; row < this->getRows(); row++ )
-   {
-      m.getRow( row,
-                rowColumns.getData(),
-                rowValues.getData() );
-      this->setRow( row,
-                    rowColumns.getData(),
-                    rowValues.getData(),
-                    m.getRowLength( row ) );
-   }
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool tnlMatrix< Real, Device, Index >::operator == ( const Matrix& matrix ) const
-{
-   if( this->getRows() != matrix.getRows() ||
-       this->getColumns() != matrix.getColumns() )
-      return false;
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      for( IndexType column = 0; column < this->getColumns(); column++ )
-         if( this->getElement( row, column ) != matrix.getElement( row, column ) )
-            return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Matrix >
-bool tnlMatrix< Real, Device, Index >::operator != ( const Matrix& matrix ) const
-{
-   return ! operator == ( matrix );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool tnlMatrix< Real, Device, Index >::save( File& file ) const
-{
-   if( ! tnlObject::save( file ) ||
-       ! file.write( &this->rows ) ||
-       ! file.write( &this->columns ) ||
-       ! this->values.save( file ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool tnlMatrix< Real, Device, Index >::load( File& file )
-{
-   if( ! tnlObject::load( file ) ||
-       ! file.read( &this->rows ) ||
-       ! file.read( &this->columns ) ||
-       ! this->values.load( file ) )
-      return false;
-   return true;
-}
-
-/*
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlMatrix< Real, Device, Index >::computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector)
-{
-   this->numberOfColors = 0;
-
-   for( IndexType i = this->getRows() - 1; i >= 0; i-- )
-   {
-      // init color array
-      Containers::Vector< Index, Device, Index > usedColors;
-      usedColors.setSize( this->numberOfColors );
-      for( IndexType j = 0; j < this->numberOfColors; j++ )
-         usedColors.setElement( j, 0 );
-
-      // find all colors used in given row
-
-   }
-}
- */
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void tnlMatrix< Real, Device, Index >::print( ostream& str ) const
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool tnlMatrix< Real, Device, Index >::help( bool verbose )
-{
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-__cuda_callable__
-void tnlMatrix< Real, Device, Index >::copyFromHostToCuda( tnlMatrix< Real, Devices::Host, Index >& matrix )
-{
-    this->numberOfColors = matrix.getNumberOfColors();
-    this->columns = matrix.getColumns();
-    this->rows = matrix.getRows();
-
-    this->values.setSize( matrix.getValuesSize() );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index tnlMatrix< Real, Device, Index >::getValuesSize() const
-{
-    return this->values.getSize();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-  __device__ __host__
-#endif
-void tnlMatrix< Real, Device, Index >::computeColorsVector(Containers::Vector<Index, Device, Index> &colorsVector)
-{
-    for( IndexType i = this->getRows() - 1; i >= 0; i-- )
-    {
-        // init color array
-        Containers::Vector< Index, Device, Index > usedColors;
-        usedColors.setSize( this->numberOfColors );
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            usedColors.setElement( j, 0 );
-
-        // find all colors used in given row
-        for( IndexType j = i + 1; j < this->getColumns(); j++ )
-             if( this->getElement( i, j ) != 0.0 )
-                 usedColors.setElement( colorsVector.getElement( j ), 1 );
-
-        // find unused color
-        bool found = false;
-        for( IndexType j = 0; j < this->numberOfColors; j++ )
-            if( usedColors.getElement( j ) == 0 )
-            {
-                colorsVector.setElement( i, j );
-                found = true;
-                break;
-            }
-        if( !found )
-        {
-            colorsVector.setElement( i, this->numberOfColors );
-            this->numberOfColors++;
-        }
-    }
-}
-
-#ifdef HAVE_CUDA
-template< typename Matrix,
-          typename InVector,
-          typename OutVector >
-__global__ void tnlMatrixVectorProductCudaKernel( const Matrix* matrix,
-                                                  const InVector* inVector,
-                                                  OutVector* outVector,
-                                                  int gridIdx )
-{
-   tnlStaticAssert( Matrix::DeviceType::DeviceType == tnlCudaDevice, );
-   const typename Matrix::IndexType rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( rowIdx < matrix->getRows() )
-      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
-}
-#endif
-
-template< typename Matrix,
-          typename InVector,
-          typename OutVector >
-void tnlMatrixVectorProductCuda( const Matrix& matrix,
-                                 const InVector& inVector,
-                                 OutVector& outVector )
-{
-#ifdef HAVE_CUDA
-   typedef typename Matrix::IndexType IndexType;
-   Matrix* kernel_this = tnlCuda::passToDevice( matrix );
-   InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
-   OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
-   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
-   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-   {
-      if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
-      tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
-                                     ( kernel_this,
-                                       kernel_inVector,
-                                       kernel_outVector,
-                                       gridIdx );
-   }
-   tnlCuda::freeFromDevice( kernel_this );
-   tnlCuda::freeFromDevice( kernel_inVector );
-   tnlCuda::freeFromDevice( kernel_outVector );
-   TNL_CHECK_CUDA_DEVICE;
-#endif
-}
-
-#endif /* TNLMATRIX_IMPL_H_ */
diff --git a/src/TNL/legacy/tnlSparseMatrix_impl.h b/src/TNL/legacy/tnlSparseMatrix_impl.h
deleted file mode 100644
index 5b4b3ddc1c53878c4a73f3eaadb6d63698e0a82e..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/tnlSparseMatrix_impl.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/***************************************************************************
-                          SparseMatrix_impl.h  -  description
-                             -------------------
-    begin                : Dec 21, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SparseMATRIX_IMPL_H_
-#define SparseMATRIX_IMPL_H_
-
-template< typename Real,
-          typename Device,
-          typename Index >
-SparseMatrix< Real, Device, Index >::SparseMatrix()
-: maxRowLength( 0 )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Real2,
-             typename Device2,
-             typename Index2 >
-bool SparseMatrix< Real, Device, Index >::setLike( const SparseMatrix< Real2, Device2, Index2 >& matrix )
-{
-   if( ! tnlMatrix< Real, Device, Index >::setLike( matrix ) ||
-       ! this->allocateMatrixElements( matrix.getNumberOfMatrixElements() ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index SparseMatrix< Real, Device, Index >::getNumberOfMatrixElements() const
-{
-   return this->values.getSize();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index SparseMatrix< Real, Device, Index >::getNumberOfNonzeroMatrixElements() const
-{
-   IndexType nonzeroElements( 0 );
-   for( IndexType i = 0; i < this->values.getSize(); i++ )
-      if( this->columnIndexes.getElement( i ) != this-> columns &&
-          this->values.getElement( i ) != 0.0 )
-         nonzeroElements++;
-   return nonzeroElements;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Index
-SparseMatrix< Real, Device, Index >::
-getMaxRowLength() const
-{
-   return this->maxRowLength;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Index SparseMatrix< Real, Device, Index >::getPaddingIndex() const
-{
-   return this->getColumns();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void SparseMatrix< Real, Device, Index >::reset()
-{
-   tnlMatrix< Real, Device, Index >::reset();
-   this->values.reset();
-   this->columnIndexes.reset();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool SparseMatrix< Real, Device, Index >::save( File& file ) const
-{
-   if( ! tnlMatrix< Real, Device, Index >::save( file ) ||
-       ! this->values.save( file ) ||
-       ! this->columnIndexes.save( file ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool SparseMatrix< Real, Device, Index >::load( File& file )
-{
-   if( ! tnlMatrix< Real, Device, Index >::load( file ) ||
-       ! this->values.load( file ) ||
-       ! this->columnIndexes.load( file ) )
-      return false;
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-  __device__ __host__
-#endif
-Containers::Vector< Index, Device, Index > SparseMatrix< Real, Device, Index >::getColumnIndexes()
-{
-    return this->columnIndexes;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-#ifdef HAVE_CUDA
-  __device__ __host__
-#endif
-void SparseMatrix< Real, Device, Index >::copyFromHostToCuda( SparseMatrix< Real, Devices::Host, Index >& matrix )
-{
-    tnlMatrix< Real, Device, Index >::copyFromHostToCuda( matrix );
-
-    this->columnIndexes.setSize( matrix.getValuesSize() );
-    this->columnIndexes.setValue( this->getPaddingIndex() );
-    this->maxRowLength = matrix.getMaxRowLength();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-bool SparseMatrix< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
-{
-   if( ! this->values.setSize( numberOfMatrixElements ) ||
-       ! this->columnIndexes.setSize( numberOfMatrixElements ) )
-      return false;
-
-   /****
-    * Setting a column index to this->columns means that the
-    * index is undefined.
-    */
-   this->columnIndexes.setValue( this->columns );
-   return true;
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-void SparseMatrix< Real, Device, Index >::printStructure( ostream& str ) const
-{
-}
-
-#endif /* SparseMATRIX_IMPL_H_ */
diff --git a/src/TNL/legacy/vdb/TODO b/src/TNL/legacy/vdb/TODO
deleted file mode 100755
index 17eb48a8144cd72046b207e49ea4d79dc757fb83..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-echo
-echo
-echo "Add integration to tnl library -- that means: templates for"
-echo "Device types. Add updateTree method. Add Values class to store "
-echo "interpoled values in each node. Compare speed with original"
-echo "VDB. Implement on CUDA. Implement 3D version (should be trivial)."
-echo
diff --git a/src/TNL/legacy/vdb/draw.py b/src/TNL/legacy/vdb/draw.py
deleted file mode 100644
index 4df7253b5934595266c9bd440678b1e2067e7daf..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/draw.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import re
-from pyx import *
-
-filename = "nodesLevel_"
-depth = 5
-colors = [
-    color.cmyk.Yellow,
-    color.rgb.green,
-    color.rgb.blue,
-    color.rgb.red,
-    color.rgb.black
-]
-c = canvas.canvas()
-for i in range(depth):
-    with open(filename + str(i), 'r') as f:
-        lines = f.readlines()
-    getnumbers = re.compile(r"\d+")
-    aux = getnumbers.findall(lines[0])
-    region = {"x1": int(aux[0]),
-              "x2": int(aux[1]),
-              "y1": int(aux[2]),
-              "y2": int(aux[3]),
-              "level": int(aux[4])
-              }
-    aux = getnumbers.findall(lines[1])
-    splitting = {"splitx": int(aux[0]),
-                 "splity": int(aux[1]),
-                 "logx": int(aux[2]),
-                 "logy": int(aux[3])
-                 }
-    states = []
-    for j in range(3, len(lines)):
-        aux = getnumbers.findall(lines[j])
-        states.append(
-                     {"x": int(aux[0]),
-                      "y": int(aux[1]),
-                      "state": int(aux[2])}
-                     )
-    lengthx = region.get("x2") - region.get("x1")
-    rectsx = splitting.get("splitx") * \
-             (splitting.get("logx") ** region.get("level"))
-    stepx = lengthx / rectsx
-    lengthy = region.get("y2") - region.get("y1")
-    rectsy = splitting.get("splity") * \
-             (splitting.get("logy") ** region.get("level"))
-    stepy = lengthy / rectsy
-    print(str(stepx))
-    print(str(stepy))
-    for state in states:
-        if state.get("state") and i < depth - 1:
-            c.stroke(path.rect(state.get("x") * stepx, 
-                               state.get("y") * stepy, 
-                               stepx,
-                               stepy), 
-                     [deco.filled([colors[i]])])
-        elif i == 0:
-            c.stroke(path.rect(state.get("x") * stepx,
-                               state.get("y") * stepy,
-                               stepx,
-                               stepy),
-                     [deco.filled([color.rgb.white])])
-        elif i == depth - 1:
-            c.fill(path.rect(state.get("x") * stepx,
-                             state.get("y") * stepy,
-                             stepx,
-                             stepy),
-                   [deco.filled([color.rgb.black])])
-c.writePDFfile(filename) 
-
diff --git a/src/TNL/legacy/vdb/make b/src/TNL/legacy/vdb/make
deleted file mode 100755
index a169859cce3df7d3d82034e026e9ebb2e582247c..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/make
+++ /dev/null
@@ -1 +0,0 @@
-g++ -O1 -g -Wall -std=c++11 -Werror tnlInternalNode.h tnlInternalNode_impl.h tnlLeafNode.h tnlLeafNode_impl.h tnlNode.h tnlNode_impl.h tnlVDBMath.h tnlRootNode_test.cpp tnlRootNode.h tnlRootNode_impl.h tnlArea2D.h tnlArea2D_impl.h tnlCircle2D.h tnlCircle2D_impl.h tnlBitmaskArray.h tnlBitmaskArray_impl.h tnlBitmask.h tnlBitmask_impl.h -o test
diff --git a/src/TNL/legacy/vdb/tnlArea2D.h b/src/TNL/legacy/vdb/tnlArea2D.h
deleted file mode 100644
index 70f0416076dab21378b291829b4b96c4f5b670cf..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlArea2D.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef _TNLAREA2D_H_INCLUDED_
-#define _TNLAREA2D_H_INCLUDED_
-
-template< typename Real >
-class tnlArea2D
-{
-public:
-    tnlArea2D( Real startX,
-               Real endX,
-               Real startY,
-               Real endY );
-
-    Real getStartX();
-
-    Real getEndX();
-
-    Real getLengthX();
-
-    Real getStartY();
-
-    Real getEndY();
-
-    Real getLengthY();
-
-    ~tnlArea2D(){};
-
-private:
-    Real startX;
-    Real endX;
-    Real startY;
-    Real endY;
-};
-
-#include "tnlArea2D_impl.h"
-#endif // _TNLAREA2D_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlArea2D_impl.h b/src/TNL/legacy/vdb/tnlArea2D_impl.h
deleted file mode 100644
index 7a7c2e7fa885ecf3d96e02eba54ff4cd6e5722c7..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlArea2D_impl.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _TNLAREA2D_IMPL_H_INCLUDED_
-#define _TNLAREA2D_IMPL_H_INCLUDED_
-
-#include "tnlArea2D.h"
-
-template< typename Real >
-tnlArea2D< Real >::tnlArea2D( Real startX,
-                              Real endX,
-                              Real startY,
-                              Real endY )
-{
-    this->startX = startX;
-    this->endX = endX;
-    this->startY = startY;
-    this->endY = endY;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getStartX()
-{
-    return this->startX;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getEndX()
-{
-    return this->endX;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getLengthX()
-{
-    return this->endX - this->startX;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getStartY()
-{
-    return this->startY;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getEndY()
-{
-    return this->endY;
-}
-
-template< typename Real >
-Real tnlArea2D< Real >::getLengthY()
-{
-    return this->endY - this->startY;
-}
-
-#endif // _TNLAREA2D_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlBitmask.h b/src/TNL/legacy/vdb/tnlBitmask.h
deleted file mode 100644
index 0c69362ec7d5dc5a4454b85c2564164243843f09..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlBitmask.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _TNLBITMASK_H_INCLUDED_
-#define _TNLBITMASK_H_INCLUDED_
-
-#include <cstdint>
-
-class tnlBitmask
-{
-public:
-    tnlBitmask( bool state, unsigned x, unsigned y );
-
-    tnlBitmask( tnlBitmask* bitmask );
-    
-    bool getState();
-    
-    unsigned getX();
-    
-    unsigned getY();
-    
-    uint64_t getBitmask();
-
-    ~tnlBitmask(){};
-    
-private:
-    uint64_t bitmask;
-};
-
-#include "tnlBitmask_impl.h"
-#endif //_TNLBITMASK_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlBitmaskArray.h b/src/TNL/legacy/vdb/tnlBitmaskArray.h
deleted file mode 100644
index c54e22b0ce9e7f97c8e08c41ba48a74f980d94b2..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlBitmaskArray.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef _TNLBITMASKARRAY_H_INCLUDED_
-#define _TNLBITMASKARRAY_H_INCLUDED_
-
-#include "tnlBitmask.h"
-
-template< unsigned Size >
-class tnlBitmaskArray
-{
-public:
-    tnlBitmaskArray();
-
-    unsigned getSize();
-
-    void setIthBitmask( unsigned i,
-                        tnlBitmask bitmask );
-
-    tnlBitmask* getIthBitmask( unsigned i );
-
-    ~tnlBitmaskArray();
-
-private:
-    tnlBitmask* bitmaskArray[ Size ];
-    unsigned length;
-};
-
-#include "tnlBitmaskArray_impl.h"
-#endif // _TNLBITMASKARRAY_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlBitmaskArray_impl.h b/src/TNL/legacy/vdb/tnlBitmaskArray_impl.h
deleted file mode 100644
index 829e184ae36d7666281d26bbd6b58107e75473c4..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlBitmaskArray_impl.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _TNLBITMASKARRAY_IMPL_H_INCLUDED_
-#define _TNLBITMASKARRAY_IMPL_H_INCLUDED_
-
-#include <cassert>
-#include "tnlBitmask.h"
-#include "tnlBitmaskArray.h"
-
-template< unsigned Size >
-tnlBitmaskArray< Size >::tnlBitmaskArray()
-{
-    this->length = Size;
-}
-
-template< unsigned Size >
-unsigned tnlBitmaskArray< Size >::getSize()
-{
-    return this->length;
-}
-
-template< unsigned Size >
-void tnlBitmaskArray< Size >::setIthBitmask( unsigned i,
-                                             tnlBitmask bitmask )
-{
-    assert( i < Size );
-    this->bitmaskArray[ i ] = new tnlBitmask( bitmask );
-}
-
-template< unsigned Size >
-tnlBitmask* tnlBitmaskArray< Size >::getIthBitmask( unsigned i )
-{
-    assert( i < Size );
-    return this->bitmaskArray[ i ];
-}
-
-template< unsigned Size >
-tnlBitmaskArray< Size >::~tnlBitmaskArray()
-{
-    for( int i = 0; i < this->length; i++ )
-        delete this->bitmaskArray[ i ];
-    delete this->bitmaskArray;
-}
-
-#endif // _TNLBITMASKARRAY_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlBitmask_impl.h b/src/TNL/legacy/vdb/tnlBitmask_impl.h
deleted file mode 100644
index 7deea649abdb9d9509a0cc89917327856f1385ff..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlBitmask_impl.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _TNLBITMASK_IMPL_H_INCLUDED_
-#define _TNLBITMASK_IMPL_H_INCLUDED_
-
-#include <iostream>
-#include <cstdint>
-#include "tnlBitmask.h"
-
-using namespace std;
-
-tnlBitmask::tnlBitmask( bool state,
-                        unsigned x,
-                        unsigned y )
-/*
-  variables x and y have at most 30 active bits
-*/                        
-{
-    uint64_t state64 = state;
-    uint64_t x64 = x;
-    x64 <<= 4;
-    uint64_t y64 = y;
-    y64 <<= 34;
-    this->bitmask = x64 | y64 | state64;
-}                        
-
-tnlBitmask::tnlBitmask( tnlBitmask* bitmask )
-{
-    this->bitmask = bitmask->getBitmask();
-}
-
-bool tnlBitmask::getState()
-{
-    return this->bitmask & 1;
-}
-
-unsigned tnlBitmask::getX()
-{
-    unsigned mask = 3 << 30;
-    unsigned x = this->bitmask >> 4;
-    return ( unsigned ) ( x & ( ~mask ) );
-}
-
-unsigned tnlBitmask::getY()
-{
-    unsigned mask = 3 << 30;
-    uint64_t y = this->bitmask >> 34;
-    return ( unsigned ) ( y & ( ~mask ) );
-}
-
-uint64_t tnlBitmask::getBitmask()
-{
-    return this->bitmask;
-}
-
-#endif //_TNLBITMASK_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlCircle2D.h b/src/TNL/legacy/vdb/tnlCircle2D.h
deleted file mode 100644
index d40393f7941d4af0e2abcffc092449b55bee6a3c..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlCircle2D.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _TNLCIRCLE2D_H_INCLUDED_
-#define _TNLCIRCLE2D_H_INCLUDED_
-
-template< typename Real >
-class tnlCircle2D
-{
-public:
-    tnlCircle2D( unsigned a,
-                 unsigned b,
-                 unsigned r );
-
-    bool isIntercept( Real x1,
-                      Real x2,
-                      Real y1,
-                      Real y2,
-                      bool verbose = false );
-
-    bool isInInterval( Real x1,
-                       Real x2,
-                       Real x );
-
-    ~tnlCircle2D();
-
-private:
-    // x and y define center of the circle
-    // r defines its radius
-    unsigned a;
-    unsigned b;
-    unsigned r;
-};
-
-#include "tnlCircle2D_impl.h"
-#endif // _TNLCIRCLE2D_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlCircle2D_impl.h b/src/TNL/legacy/vdb/tnlCircle2D_impl.h
deleted file mode 100644
index a0871726d89914ed179acd2a635c3448395a7fca..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlCircle2D_impl.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef _TNLCIRCLE2D_IMPL_H_INCLUDED_
-#define _TNLCIRCLE2D_IMPL_H_INCLUDED_
-
-#include <iostream>
-#include <cmath>
-#include "tnlCircle2D.h"
-
-template< typename Real >
-tnlCircle2D< Real >::tnlCircle2D( unsigned a,
-                                  unsigned b,
-                                  unsigned r )
-{
-    this->a = a;
-    this->b = b;
-    this->r = r;
-}
-
-template< typename Real >
-bool tnlCircle2D< Real >::isIntercept( Real x1,
-                                       Real x2,
-                                       Real y1,
-                                       Real y2,
-                                       bool verbose )
-{
-    if( this->isInInterval( x1, x2, this->a - this->r ) &&
-        this->isInInterval( x1, x2, this->a + this->r ) &&
-        this->isInInterval( y1, y2, this->b - this->r ) &&
-        this->isInInterval( y1, y2, this->b + this->r ) )
-    {
-        if( verbose )
-            std::cout << "Circle is inside area." << std::endl;
-        return true;
-    }
-    else if( verbose )
-        std::cout << "Circle is not inside area." << std::endl;
-
-    Real R = this->r * this->r;
-
-    Real aux = x1 - this->a;
-    if( R - aux * aux >= 0 &&
-        ( this->isInInterval( y1, y2, sqrt( R - aux * aux ) + this->b ) ||
-        this->isInInterval( y1, y2, -sqrt( R - aux * aux ) + this->b ) ) )
-    {
-        if( verbose )
-            std::cout << "Circle intercepts left boundry of area." << std::endl;
-        return true;
-    }
-    
-    aux = x2 - this->a;
-    if( R - aux * aux >= 0 &&
-        ( this->isInInterval( y1, y2, sqrt( R - aux * aux ) + this->b ) ||
-        this->isInInterval( y1, y2, -sqrt( R - aux * aux ) + this->b ) ) )
-    {
-        if( verbose )
-            std::cout << "Circle intercepts right boundry of area." << std::endl;
-        return true;
-    }
-
-    aux = y1 - this->b;
-    if( R - aux * aux >= 0 &&
-        ( this->isInInterval( x1, x2, sqrt( R - aux * aux ) + this->a ) ||
-        this->isInInterval( x1, x2, -sqrt( R - aux * aux ) + this->a ) ) )
-    {
-        if( verbose )
-            std::cout << "Circle intercepts bottom boundry of area." << std::endl;
-        return true;
-    }
-
-    aux = y2 - this->b;
-    if( R - aux * aux >= 0 &&
-        ( this->isInInterval( x1, x2, sqrt( R - aux * aux ) + this->a ) ||
-        this->isInInterval( x1, x2, sqrt( R - aux * aux ) + this->a ) ) )
-    {
-        if( verbose )
-            std::cout << "Circle intercepts top boundry of area." << std::endl;
-        return true;
-    }
-
-    if( verbose )
-        std::cout << "Circle does not intercept area." << std::endl;
-
-    return false;
-}
-
-template< typename Real >
-bool tnlCircle2D< Real >::isInInterval( Real x1,
-                                        Real x2,
-                                        Real x )
-{
-    return ( ( x1 <= x ) and ( x <= x2 ) );
-}
-
-#endif // _TNLCIRCLE2D_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlInternalNode.h b/src/TNL/legacy/vdb/tnlInternalNode.h
deleted file mode 100644
index 4281c303a016f769d1556b12383fd628fcf726a4..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlInternalNode.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _TNLINTERNALNODE_H_INCLUDED_
-#define _TNLINTERNALNODE_H_INCLUDED_
-
-#include "tnlNode.h"
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY = LogX >
-class tnlInternalNode : public tnlNode< Real, Index, LogX, LogY >
-{
-public:
-    tnlInternalNode( tnlArea2D< Real >* area,
-                     tnlCircle2D< Real >* circle,
-                     Index X,
-                     Index Y,
-                     Index level );
-
-    void setNode( Index splitX,
-                  Index splitY,
-                  Index depth );
-
-    void setChildren( Index splitX,
-                      Index splitY,
-                      Index depth );
-
-    void write( fstream& f,
-                Index level );
-
-    ~tnlInternalNode();
-
-private:
-    tnlBitmaskArray< LogX * LogY >* bitmaskArray;
-    tnlNode< Real, Index, LogX, LogY >* children[ LogX * LogY ];
-};
-
-
-#include "tnlInternalNode_impl.h"
-#endif // _TNLINTERNALNODE_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlInternalNode_impl.h b/src/TNL/legacy/vdb/tnlInternalNode_impl.h
deleted file mode 100644
index 63bdd40e41a4f29e7af689a95b49bda22ed90176..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlInternalNode_impl.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef _TNLINTERNALNODE_IMPL_H_INCLUDED_
-#define _TNLINTERNALNODE_IMPL_H_INCLUDED_
-
-#include <iostream>
-#include <iomanip>
-#include "tnlInternalNode.h"
-#include "tnlLeafNode.h"
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlInternalNode< Real, Index, LogX, LogY >::tnlInternalNode( tnlArea2D< Real >* area,
-                                                             tnlCircle2D< Real >* circle,
-                                                             Index X,
-                                                             Index Y,
-                                                             Index level )
-: tnlNode< Real, Index, LogX, LogY >::tnlNode( area, circle, X, Y, level )
-{
-    this->bitmaskArray = new tnlBitmaskArray< LogX * LogY >();
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlInternalNode< Real, Index, LogX, LogY >::setNode( Index splitX,
-                                                          Index splitY,
-                                                          Index depth )
-{
-    tnlNode< Real, Index, LogX, LogY >::setNode( splitX, splitY, this->bitmaskArray );
-    this->setChildren( splitX, splitY, depth );
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlInternalNode< Real, Index, LogX, LogY >::setChildren( Index splitX,
-                                                              Index splitY,
-                                                              Index depth )
-{
-    for( Index i = 0; i < LogY; i++ )
-        for( Index j = 0; j < LogX; j++ )
-        {
-            Index index = i * LogY + j;
-            if( !this->bitmaskArray->getIthBitmask( index )->getState() )
-                this->children[ index ] = NULL;
-            else if( this->level < depth - 1 )
-            {
-                //std::cout << "creating new node, level = " << this->level << std::endl;
-                Index X = this->X * LogX + j;
-                Index Y = this->Y * LogY + i;
-                this->children[ index ] = new tnlInternalNode< Real, Index, LogX, LogY >( this->area,
-                                                                             this->circle,
-                                                                             X,
-                                                                             Y,
-                                                                             this->level + 1 );
-                this->children[ index ]->setNode( splitX, splitY, depth );
-            }
-            else
-            {
-                Index X = this->X * LogX + j;
-                Index Y = this->Y * LogY + i;
-                this->children[ index ] = new tnlLeafNode< Real, Index, LogX, LogY >( this->area,
-                                                                         this->circle,
-                                                                         X,
-                                                                         Y,
-                                                                         this->level + 1 );
-                this->children[ index ]->setNode( splitX, splitY, depth );
-            }    
-        }
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlInternalNode< Real, Index, LogX, LogY >::write( fstream& file,
-                                                        Index level )
-{
-    for( Index i = 0; i < LogX * LogY; i++ )
-    {
-        if( this->level == level )
-        {
-            Index x = this->bitmaskArray->getIthBitmask( i )->getX();
-            Index y = this->bitmaskArray->getIthBitmask( i )->getY();
-            bool state = this->bitmaskArray->getIthBitmask( i )->getState();
-            file << "x=" << setw( 10 ) << x
-                 << ", y=" << setw( 10 ) << y
-                 << ", state=" << setw( 1 ) << state
-                 << std::endl;
-        }
-        else if( this->children[ i ] )
-            this->children[ i ]->write( file, level );
-    }
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlInternalNode< Real, Index, LogX, LogY >::~tnlInternalNode()
-{
-    delete this->bitmaskArray;
-    for( Index i = 0; i < LogX * LogY; i++ )
-        delete this->children[ i ];
-    delete [] this->children;
-}
-
-
-#endif // _TNLINTERNALNODE_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlLeafNode.h b/src/TNL/legacy/vdb/tnlLeafNode.h
deleted file mode 100644
index 6d24efe83be24fc3ec96afcb72981f9ae0f4844f..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlLeafNode.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _TNLLEAFNODE_H_INCLUDED_
-#define _TNLLEAFNODE_H_INCLUDED_
-
-#include <fstream>
-#include "tnlNode.h"
-#include "tnlArea2D.h"
-#include "tnlCircle2D.h"
-#include "tnlBitmask.h"
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY = LogX >
-class tnlLeafNode : public tnlNode< Real, Index, LogX, LogY >
-{
-public:
-    tnlLeafNode( tnlArea2D< Real >* area,
-                 tnlCircle2D< Real >* circle,
-                 Index X,
-                 Index Y,
-                 Index level );
-
-    void setNode( Index splitX,
-                  Index splitY,
-                  Index depth );
-
-    void write( fstream& file,
-                Index level );
-
-    ~tnlLeafNode();
-
-private:
-    tnlBitmaskArray< LogX * LogY >* bitmaskArray;
-};
-
-#include "tnlLeafNode_impl.h"
-#endif // _TNLLEAFNODE_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlLeafNode_impl.h b/src/TNL/legacy/vdb/tnlLeafNode_impl.h
deleted file mode 100644
index 1b228ee559f7a4f0d93afd6617c2b9e6ce048e8b..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlLeafNode_impl.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef _TNLLEAFNODE_IMPL_H_INCLUDED_
-#define _TNLLEAFNODE_IMPL_H_INCLUDED_
-
-#include "tnlLeafNode.h"
-#include <iostream>
-#include <iomanip>
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlLeafNode< Real, Index, LogX, LogY >::tnlLeafNode( tnlArea2D< Real >* area,
-                                                     tnlCircle2D< Real >* circle,
-                                                     Index X,
-                                                     Index Y,
-                                                     Index level )
-: tnlNode< Real, Index, LogX, LogY >::tnlNode( area, circle, X, Y, level )
-{
-    this->bitmaskArray = new tnlBitmaskArray< LogX * LogY >();
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlLeafNode< Real, Index, LogX, LogY >::setNode( Index splitX,
-                                                      Index splitY,
-                                                      Index depth )
-{
-    tnlNode< Real, Index, LogX, LogY >::setNode( splitX, splitY, this->bitmaskArray );
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlLeafNode< Real, Index, LogX, LogY >::write( fstream& file,
-                                                    Index level )
-{
-    for( Index i = 0; i < LogX * LogY; i++ )
-    {
-        Index x = this->bitmaskArray->getIthBitmask( i )->getX();
-        Index y = this->bitmaskArray->getIthBitmask( i )->getY();
-        bool state = this->bitmaskArray->getIthBitmask( i )->getState();
-        file << "x=" << setw( 10 ) << x
-             << ", y=" << setw( 10 ) << y
-             << ", state=" << setw( 1 ) << state
-             << std::endl;
-    }
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlLeafNode< Real, Index, LogX, LogY >::~tnlLeafNode()
-{
-    delete this->bitmaskArray;
-}
-
-#endif // _TNLLEAFNODE_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlNode.h b/src/TNL/legacy/vdb/tnlNode.h
deleted file mode 100644
index 7910e4bbeec976924b7b6a008765a695ff2b8ac0..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlNode.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef _TNLNODE_H_INCLUDED_
-#define _TNLNODE_H_INCLUDED_
-
-#include "tnlBitmaskArray.h"
-#include "tnlArea2D.h"
-#include "tnlCircle2D.h"
-#include <fstream>
-
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY = LogX >
-class tnlNode
-{
-public:
-    tnlNode( tnlArea2D< Real >* area,
-             tnlCircle2D< Real >* circle,
-             Index X,
-             Index Y,
-             Index level );
-
-    void setNode( Index splitX,
-                  Index splitY,
-                  tnlBitmaskArray< LogX * LogY >* bitmaskArray );
-
-    virtual void setNode( Index splitX = 0,
-                          Index splitY = 0,
-                          Index depth = 0 ){};
-
-    virtual void write( fstream& f,
-                        Index level ){};
-
-    Index getLevel();
-
-    ~tnlNode();
-
-protected:
-    tnlArea2D< Real >* area;
-    tnlCircle2D< Real >* circle;
-    Index X, Y, level;
-};
-
-#include "tnlNode_impl.h"
-#endif // _TNLNODE_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlNode_impl.h b/src/TNL/legacy/vdb/tnlNode_impl.h
deleted file mode 100644
index fb6f80b54e1b80045f5532dd7fcc684b053cb061..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlNode_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef _TNLNODE_IMPL_H_INCLUDED_
-#define _TNLNODE_IMPL_H_INCLUDED_
-
-#include "tnlNode.h"
-#include "tnlVDBMath.h"
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlNode< Real, Index, LogX, LogY >::tnlNode( tnlArea2D< Real >* area,
-                                             tnlCircle2D< Real >* circle,
-                                             Index X,
-                                             Index Y,
-                                             Index level )
-{
-    this->area = area;
-    this->circle = circle;
-    this->level = level;
-    this->X = X;
-    this->Y = Y;
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-Index tnlNode< Real, Index, LogX, LogY >::getLevel()
-{
-    return this->level;
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-void tnlNode< Real, Index, LogX, LogY >::setNode( Index splitX,
-                                                  Index splitY,
-                                                  tnlBitmaskArray< LogX * LogY >* bitmaskArray )
-{
-
-    Index depthX = splitX * tnlVDBMath< Index >::power( LogX, this->level - 1 );
-    Index depthY = splitY * tnlVDBMath< Index >::power( LogY, this->level - 1 );
-    Real stepX = ( Real ) this->area->getLengthX() / depthX;
-    Real stepY = ( Real ) this->area->getLengthY() / depthY;
-    Real startX = this->X * stepX;
-    Real endX = ( this->X + 1 ) * stepX;
-    Real startY = this->Y * stepY;
-    Real endY = ( this->Y + 1 ) * stepY;
-    Real dx = ( endX - startX ) / LogX;
-    Real dy = ( endY - startY ) / LogY;
-    for( Index i = 0; i < LogY; i++ )
-        for( Index j = 0; j < LogX; j++ )
-        {
-            Real x1 = startX + j * dx;
-            Real x2 = startX + ( j + 1 ) * dx;
-            Real y1 = startY + i * dy;
-            Real y2 = startY + ( i + 1 ) * dy;
-            bool state = this->circle->isIntercept( x1, x2, y1, y2 );
-            Index posX = this->X * LogX + j;
-            Index posY = this->Y * LogY + i;
-            tnlBitmask* bitmask = new tnlBitmask( state, posX, posY );
-            bitmaskArray->setIthBitmask( i * LogX + j, bitmask );
-        }
-}
-
-template< typename Real,
-          typename Index,
-          Index LogX,
-          Index LogY >
-tnlNode< Real, Index, LogX, LogY >::~tnlNode()
-{
-    this->area = NULL;
-    this->circle = NULL;
-}
-
-#endif // _TNLNODE_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlRootNode.h b/src/TNL/legacy/vdb/tnlRootNode.h
deleted file mode 100644
index b944549c5d4b32e647438388d8590bd7d8b1ddc9..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlRootNode.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _TNLROOTNODE_H_INCLUDED_
-#define _TNLROOTNODE_H_INCLUDED_
-
-#include "tnlNode.h"
-
-template< typename Real,
-          typename Index,
-          unsigned Size,
-          Index LogX,
-          Index LogY = LogX >
-class tnlRootNode : public tnlNode< Real, Index, LogX, LogY >
-{
-public:
-    tnlRootNode( tnlArea2D< Real >* area,
-                 tnlCircle2D< Real >* circle,
-                 unsigned nodesX,
-                 unsigned nodesY,
-                 unsigned depth );
-
-    void setNode();
-
-    void createTree();
-
-    void write();
-
-    ~tnlRootNode();
-
-private:
-    unsigned nodesX;
-    unsigned nodesY;
-    tnlBitmaskArray< Size >* bitmaskArray;
-    tnlNode< Real, Index, LogX, LogY >* children[ Size ];
-    unsigned depth;
-};
-
-#include "tnlRootNode_impl.h"
-#endif // _TNLROOTNODE_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlRootNode_impl.h b/src/TNL/legacy/vdb/tnlRootNode_impl.h
deleted file mode 100644
index 8bd0ecb01553b005c6e21a744979be78f814be55..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlRootNode_impl.h
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef _TNLROOTNODE_IMPL_H_INCLUDED_
-#define _TNLROOTNODE_IMPL_H_INCLUDED_
-
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include "tnlNode.h"
-#include "tnlRootNode.h"
-#include "tnlInternalNode.h"
-#include "tnlLeafNode.h"
-
-
-template< typename Real,
-          typename Index,
-          unsigned Size,
-          Index LogX,
-          Index LogY >
-tnlRootNode< Real, Index, Size, LogX, LogY >::tnlRootNode( tnlArea2D< Real >* area,
-                                                           tnlCircle2D< Real >* circle,
-                                                           unsigned nodesX,
-                                                           unsigned nodesY,
-                                                           unsigned depth )
-: tnlNode< Real, Index, LogX, LogY >::tnlNode( area, circle, 0, 0, 0 )
-{
-    this->nodesX = nodesX;
-    this->nodesY = nodesY;
-    this->bitmaskArray = new tnlBitmaskArray< Size >();
-    this->depth = depth;
-}
-
-template< typename Real,
-          typename Index,
-          unsigned Size, 
-          Index LogX,
-          Index LogY >
-void tnlRootNode< Real, Index, Size, LogX, LogY >::setNode()
-{
-    Real stepX = ( this->area->getEndX() - this->area->getStartX() ) / this->nodesX;
-    Real stepY = ( this->area->getEndY() - this->area->getStartY() ) / this->nodesY;
-    Real startX = this->area->getStartX();
-    Real startY = this->area->getStartY();
-    for( Index i = 0; i < this->nodesX; i++ )
-        for( Index j = 0; j < this->nodesY; j++ )
-        {
-            Real x1 = startX + j * stepX;
-            Real x2 = startX + ( j + 1 ) * stepX;
-            Real y1 = startY + i * stepY;
-            Real y2 = startY + ( i + 1 ) * stepY;
-            bool state = this->circle->isIntercept( x1, x2, y1, y2 );
-            Index X = j;
-            Index Y = i;
-            tnlBitmask* bitmask = new tnlBitmask( state, X, Y );
-            this->bitmaskArray->setIthBitmask( i * this->nodesX + j, bitmask);
-        }
-}
-
-template< typename Real,
-          typename Index,
-          unsigned Size,
-          Index LogX,
-          Index LogY >
-void tnlRootNode< Real, Index, Size, LogX, LogY >::createTree()
-{
-    this->setNode(); // first we need to create root node
-    for( Index i = 0; i < this->nodesY; i++ )
-        for( Index j = 0; j < this-> nodesX; j++ )
-        {
-            Index index = i * this->nodesY + j;
-            if( !this->bitmaskArray->getIthBitmask( index )->getState() )
-                this->children[ index ] = NULL;
-            else if( this->level < this->depth - 1 )
-            {
-                Index X = j;
-                Index Y = i;
-                this->children[ index ] = new tnlInternalNode< Real, Index, LogX, LogY >( this->area,
-                                                                                          this->circle,
-                                                                                          X,
-                                                                                          Y,
-                                                                                          this->level + 1 );
-                this->children[ index ]->setNode( nodesX, nodesY, this->depth );
-            }
-            else
-            {
-                Index X = j;
-                Index Y = i;
-                this->children[ index ] = new tnlLeafNode< Real, Index, LogX, LogY >( this->area,
-                                                                                      this->circle,
-                                                                                      X,
-                                                                                      Y,
-                                                                                      this->level + 1 );
-                this->children[ index ]->setNode( nodesX, nodesY, this->depth );
-            }
-        }
-}
-
-template< typename Real,
-          typename Index,
-          unsigned Size,
-          Index LogX,
-          Index LogY >
-void tnlRootNode< Real, Index, Size, LogX, LogY >::write()
-{
-    for( Index i = 0; i < this->depth; i++ )
-    {
-        std::string filename = "nodesLevel_" + std::to_string( i );
-        fstream f;
-        f.open( filename, ios::out | ios::trunc );
-        Index startX = this->area->getStartX();
-        Index endX = this->area->getEndX();
-        Index startY = this->area->getStartY();
-        Index endY = this->area->getEndY();
-        f << "startx=" << setw( 10 ) << startX
-          << ", endx=" << setw( 10 ) << endX
-          << ", starty=" <<setw( 10 ) << startY
-          << ", endy=" << setw( 10 ) << endY
-          << ", level=" << setw( 10 ) << i
-          << std::endl;
-        f << "rootSplitX=" << setw( 10 ) << this->nodesX
-          << ", rootSplitY=" << setw( 10 ) << this->nodesY
-          << ", LogX=" << setw( 10 ) << LogX
-          << ", LogY=" << setw( 10 ) << LogY 
-          << std::endl << std::endl;
-        for( Index j = 0; j < Size; j++ )
-        {
-            if( this->level == i )
-            {
-                Index x = this->bitmaskArray->getIthBitmask( j )->getX();
-                Index y = this->bitmaskArray->getIthBitmask( j )->getY();
-                bool state = this->bitmaskArray->getIthBitmask( j )->getState();
-                f << "x=" << setw( 10 ) << x
-                  << ", y=" << setw( 10 ) << y
-                  << ", state=" << setw( 1 ) << state
-                  << std::endl;
-            }
-            else if( this->children[ j ] )
-                this->children[ j ]->write( f, i );
-        }
-    }
-}
-
-template< typename Real,
-          typename Index,
-          unsigned Size,
-          Index LogX,
-          Index LogY >
-tnlRootNode< Real, Index, Size, LogX, LogY >::~tnlRootNode()
-{
-    delete this->bitmaskArray;
-    for( Index i = 0; i < Size; i++ ) 
-        delete this->children[ i ];
-    delete [] this->children;
-}
-
-#endif // _TNLROOTNODE_IMPL_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/tnlRootNode_test.cpp b/src/TNL/legacy/vdb/tnlRootNode_test.cpp
deleted file mode 100644
index 09ac03c49f6d32c63e7f31e62fed4400800fcb40..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlRootNode_test.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cstdlib>
-#include <ctime>
-#include "tnlRootNode.h"
-#include "tnlArea2D.h"
-#include "tnlCircle2D.h"
-
-int main( int argc,  char** argv )
-{
-    clock_t begin = clock();
-    int areaStart = atoi( argv[ 1 ] );
-    int areaEnd = atoi( argv[ 2 ] );
-    int circleX = atoi( argv[ 3 ] );
-    int circleY = atoi( argv[ 4 ] );
-    int radius = atoi( argv[ 5 ] );
-    const unsigned x = 4;
-    const unsigned y = 4;
-    const unsigned size = x * y;
-    tnlArea2D< double >* area = new tnlArea2D< double >( areaStart, areaEnd, areaStart, areaEnd );
-    tnlCircle2D< double >* circle = new tnlCircle2D< double >( circleX, circleY, radius );
-    tnlRootNode< double, int, size, x, y >* root = new tnlRootNode< double, int, size, x, y >( area, circle, x, y, 6 );
-    root->createTree();
-    clock_t end1 = clock();
-    root->write();
-    clock_t end2 = clock();
-    std::cout << "Tree created in " << ( ( double ) (end1 - begin) ) / CLOCKS_PER_SEC << "s" << std::endl;
-    std::cout << "Tree traversed in " << ( ( double )(end2 - begin) ) / CLOCKS_PER_SEC << "s" << std::endl;
-    return 0;
-}
diff --git a/src/TNL/legacy/vdb/tnlVDBMath.h b/src/TNL/legacy/vdb/tnlVDBMath.h
deleted file mode 100644
index 4917a19f263e14145948dc61e30bcdecce965532..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/tnlVDBMath.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _TNLVDBMATH_H_INCLUDED_
-#define _TNLVDBMATH_H_INCLUDED_
-
-template< typename Index >
-class tnlVDBMath
-{
-public:
-    static Index power( Index number,
-                        Index exponent )
-    {
-        Index result = 1;
-        for( Index i = 0; i < exponent; i++ )
-            result *= number;
-        return result;
-    }
-
-};
-
-#endif // _TNLVDBMATH_H_INCLUDED_
diff --git a/src/TNL/legacy/vdb/unittests_vdb/tnlBitmaskArray_test.cpp b/src/TNL/legacy/vdb/unittests_vdb/tnlBitmaskArray_test.cpp
deleted file mode 100644
index 93de1856b54098539d5798c7fc8c46ae7a518a3a..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/unittests_vdb/tnlBitmaskArray_test.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <iostream>
-#include "tnlBitmaskArray.h"
-
-int main()
-{
-    tnlBitmaskArray< 5 >* arr = new tnlBitmaskArray< 5 >();
-    return 0;
-}
diff --git a/src/TNL/legacy/vdb/unittests_vdb/tnlBitmask_unitTest.cpp b/src/TNL/legacy/vdb/unittests_vdb/tnlBitmask_unitTest.cpp
deleted file mode 100644
index da0d6df0822286a7e50afe1dd6fb60dd312b5b30..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/unittests_vdb/tnlBitmask_unitTest.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <iostream>
-#include <cstdint>
-#include "tnlBitmask.h"
-
-using namespace std;
-
-int main()
-{
-    for( int i = 0; i < 50000; i++ )
-    {
-        bool state = i % 2;
-        unsigned x = rand() % ( 1 << 30 );
-        unsigned y = rand() % ( 1 << 30 );
-        tnlBitmask* mask = new tnlBitmask( state, x, y );
-        if( state != mask->getState() ||
-            x != mask->getX() ||
-            y != mask->getY() )
-            cout << "state = " << state << ", mask.getState() = " << mask->getState()
-            << "x = " << x << ", mask.getX() = " << mask->getX()
-            << "y = " << y << ", mask.getY() = " << mask->getY() << endl;
-    }
-}
diff --git a/src/TNL/legacy/vdb/unittests_vdb/tnlCircle2D_unitTest.cpp b/src/TNL/legacy/vdb/unittests_vdb/tnlCircle2D_unitTest.cpp
deleted file mode 100644
index 1c177eb8e35c512bc9faca11d22aad2e248ee669..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/unittests_vdb/tnlCircle2D_unitTest.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include <iostream>
-#include "tnlCircle2D.h"
-
-using namespace std;
-
-int main()
-{   // dost spatnej unittest -- vylepsit
-    tnlCircle2D* circle = new tnlCircle2D( 5, 5, 4 );
-    cout << "Testing whole circle inside area: ";
-    if( circle->isIntercept( 0, 10, 0, 10, true ) )
-        cout << "Ok" << endl;
-    else
-        cout << "Test failed." << endl;
-
-    cout << "Testing whole area inside circle: ";
-    if( !circle->isIntercept( 4, 6, 4, 6, true ) )
-        cout << "Ok" << endl;
-    else
-        cout << "Test failed." << endl;
-
-    cout << "Testing left boundry intercept: ";
-    if( circle->isIntercept( 3, 7, 0, 2, true ) )
-        cout << "Ok" << endl;
-    else
-        cout << "Test failed." << endl;
-    return 0;
-}
diff --git a/src/TNL/legacy/vdb/unittests_vdb/tnlRootNode_test.cpp b/src/TNL/legacy/vdb/unittests_vdb/tnlRootNode_test.cpp
deleted file mode 100644
index de8eb97fed32be675d3b9148d161119b79eb33cb..0000000000000000000000000000000000000000
--- a/src/TNL/legacy/vdb/unittests_vdb/tnlRootNode_test.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <iostream>
-#include "tnlRootNode.h"
-#include "tnlArea2D.h"
-#include "tnlCircle2D.h"
-
-int main()
-{
-    const unsigned x = 4;
-    const unsigned y = 4;
-    const unsigned size = x * y;
-    tnlArea2D* area = new tnlArea2D( 0, 20, 0, 20 );
-    tnlCircle2D* circle = new tnlCircle2D( 10, 10, 4 );
-    tnlRootNode< size >* root = new tnlRootNode< size >( area, circle, x, y );
-    root->setNode();
-    root->printStates();
-    return 0;
-}
diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h
index 109e947649bec236bcb7b7e64bcc84eacd44aef7..aff044601cfcc326fa43134d6da5d903cdabf5f5 100644
--- a/src/UnitTests/Containers/ArrayOperationsTest.h
+++ b/src/UnitTests/Containers/ArrayOperationsTest.h
@@ -209,11 +209,11 @@ TYPED_TEST( ArrayOperationsTest, allocateMemory_cuda )
 
    ValueType* data;
    ArrayOperations< Devices::Cuda >::allocateMemory( data, size );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
    ASSERT_NE( data, nullptr );
 
    ArrayOperations< Devices::Cuda >::freeMemory( data );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
 }
 
 TYPED_TEST( ArrayOperationsTest, setMemoryElement_cuda )
@@ -223,7 +223,7 @@ TYPED_TEST( ArrayOperationsTest, setMemoryElement_cuda )
 
    ValueType* data;
    ArrayOperations< Devices::Cuda >::allocateMemory( data, size );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
 
    for( int i = 0; i < size; i++ )
       ArrayOperations< Devices::Cuda >::setMemoryElement( &data[ i ], (ValueType) i );
@@ -237,7 +237,7 @@ TYPED_TEST( ArrayOperationsTest, setMemoryElement_cuda )
    }
 
    ArrayOperations< Devices::Cuda >::freeMemory( data );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
 }
 
 TYPED_TEST( ArrayOperationsTest, setMemory_cuda )
@@ -250,9 +250,9 @@ TYPED_TEST( ArrayOperationsTest, setMemory_cuda )
    ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size );
    ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 0, size );
    ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 13, size );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
    ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ValueType, ValueType >( hostData, deviceData, size );
-   ASSERT_TRUE( TNL_CHECK_CUDA_DEVICE );
+   ASSERT_NO_THROW( TNL_CHECK_CUDA_DEVICE );
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( hostData[ i ], 13 );
    ArrayOperations< Devices::Host >::freeMemory( hostData );
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index e451c0416f5f14e63af1e2a4a3bb117159f16229..f92954df3be4b2bb850859cad143f1ec9d0daae2 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -62,19 +62,16 @@ protected:
 
 // types for which ArrayTest is instantiated
 using ArrayTypes = ::testing::Types<
-   Array< short,  Devices::Host, short >,
    Array< int,    Devices::Host, short >,
    Array< long,   Devices::Host, short >,
    Array< float,  Devices::Host, short >,
    Array< double, Devices::Host, short >,
    Array< MyData, Devices::Host, short >,
-   Array< short,  Devices::Host, int >,
    Array< int,    Devices::Host, int >,
    Array< long,   Devices::Host, int >,
    Array< float,  Devices::Host, int >,
    Array< double, Devices::Host, int >,
    Array< MyData, Devices::Host, int >,
-   Array< short,  Devices::Host, long >,
    Array< int,    Devices::Host, long >,
    Array< long,   Devices::Host, long >,
    Array< float,  Devices::Host, long >,
@@ -83,19 +80,16 @@ using ArrayTypes = ::testing::Types<
    // FIXME: this segfaults in String::~String()
 //   Array< String, Devices::Host, long >,
 #ifdef HAVE_CUDA
-   Array< short,  Devices::Cuda, short >,
    Array< int,    Devices::Cuda, short >,
    Array< long,   Devices::Cuda, short >,
    Array< float,  Devices::Cuda, short >,
    Array< double, Devices::Cuda, short >,
    Array< MyData, Devices::Cuda, short >,
-   Array< short,  Devices::Cuda, int >,
    Array< int,    Devices::Cuda, int >,
    Array< long,   Devices::Cuda, int >,
    Array< float,  Devices::Cuda, int >,
    Array< double, Devices::Cuda, int >,
    Array< MyData, Devices::Cuda, int >,
-   Array< short,  Devices::Cuda, long >,
    Array< int,    Devices::Cuda, long >,
    Array< long,   Devices::Cuda, long >,
    Array< float,  Devices::Cuda, long >,
@@ -103,21 +97,18 @@ using ArrayTypes = ::testing::Types<
    Array< MyData, Devices::Cuda, long >,
 #endif
 #ifdef HAVE_MIC
-   Array< short,  Devices::MIC, short >,
    Array< int,    Devices::MIC, short >,
    Array< long,   Devices::MIC, short >,
    Array< float,  Devices::MIC, short >,
    Array< double, Devices::MIC, short >,
    // TODO: MyData does not work on MIC
 //   Array< MyData, Devices::MIC, short >,
-   Array< short,  Devices::MIC, int >,
    Array< int,    Devices::MIC, int >,
    Array< long,   Devices::MIC, int >,
    Array< float,  Devices::MIC, int >,
    Array< double, Devices::MIC, int >,
    // TODO: MyData does not work on MIC
 //   Array< MyData, Devices::MIC, int >,
-   Array< short,  Devices::MIC, long >,
    Array< int,    Devices::MIC, long >,
    Array< long,   Devices::MIC, long >,
    Array< float,  Devices::MIC, long >,
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index 7d37a3b78be2c5c006fc48ad9a943a6bd27ca39b..2fa6fb3afd551dabf3b26386004096ce5028226a 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -14,7 +14,9 @@
 #include <type_traits>
 
 #include <TNL/Containers/Array.h>
+#include <TNL/Containers/Vector.h>
 #include <TNL/Containers/ArrayView.h>
+#include <TNL/Containers/VectorView.h>
 
 #include "gtest/gtest.h"
 
@@ -53,91 +55,95 @@ std::ostream& operator<<( std::ostream& str, const MyData& v )
 
 
 // test fixture for typed tests
-template< typename Array >
+template< typename View >
 class ArrayViewTest : public ::testing::Test
 {
 protected:
-   using ArrayType = Array;
-   using ViewType = ArrayView< typename Array::ValueType, typename Array::DeviceType, typename Array::IndexType >;
+   using ViewType = View;
+   using ArrayType = Array< typename View::ValueType, typename View::DeviceType, typename View::IndexType >;
 };
 
 // types for which ArrayViewTest is instantiated
-using ArrayTypes = ::testing::Types<
-   Array< short,  Devices::Host, short >,
-   Array< int,    Devices::Host, short >,
-   Array< long,   Devices::Host, short >,
-   Array< float,  Devices::Host, short >,
-   Array< double, Devices::Host, short >,
-   Array< MyData, Devices::Host, short >,
-   Array< short,  Devices::Host, int >,
-   Array< int,    Devices::Host, int >,
-   Array< long,   Devices::Host, int >,
-   Array< float,  Devices::Host, int >,
-   Array< double, Devices::Host, int >,
-   Array< MyData, Devices::Host, int >,
-   Array< short,  Devices::Host, long >,
-   Array< int,    Devices::Host, long >,
-   Array< long,   Devices::Host, long >,
-   Array< float,  Devices::Host, long >,
-   Array< double, Devices::Host, long >,
-   Array< MyData, Devices::Host, long >
+using ViewTypes = ::testing::Types<
+   ArrayView< int,    Devices::Host, short >,
+   ArrayView< long,   Devices::Host, short >,
+   ArrayView< float,  Devices::Host, short >,
+   ArrayView< double, Devices::Host, short >,
+   ArrayView< MyData, Devices::Host, short >,
+   ArrayView< int,    Devices::Host, int >,
+   ArrayView< long,   Devices::Host, int >,
+   ArrayView< float,  Devices::Host, int >,
+   ArrayView< double, Devices::Host, int >,
+   ArrayView< MyData, Devices::Host, int >,
+   ArrayView< int,    Devices::Host, long >,
+   ArrayView< long,   Devices::Host, long >,
+   ArrayView< float,  Devices::Host, long >,
+   ArrayView< double, Devices::Host, long >,
+   ArrayView< MyData, Devices::Host, long >,
    // FIXME: this segfaults in String::~String()
-//   , Array< String, Devices::Host, long >
+//   , ArrayView< String, Devices::Host, long >,
 #ifdef HAVE_CUDA
-   ,
-   Array< short,  Devices::Cuda, short >,
-   Array< int,    Devices::Cuda, short >,
-   Array< long,   Devices::Cuda, short >,
-   Array< float,  Devices::Cuda, short >,
-   Array< double, Devices::Cuda, short >,
-   Array< MyData, Devices::Cuda, short >,
-   Array< short,  Devices::Cuda, int >,
-   Array< int,    Devices::Cuda, int >,
-   Array< long,   Devices::Cuda, int >,
-   Array< float,  Devices::Cuda, int >,
-   Array< double, Devices::Cuda, int >,
-   Array< MyData, Devices::Cuda, int >,
-   Array< short,  Devices::Cuda, long >,
-   Array< int,    Devices::Cuda, long >,
-   Array< long,   Devices::Cuda, long >,
-   Array< float,  Devices::Cuda, long >,
-   Array< double, Devices::Cuda, long >,
-   Array< MyData, Devices::Cuda, long >
+   ArrayView< int,    Devices::Cuda, short >,
+   ArrayView< long,   Devices::Cuda, short >,
+   ArrayView< float,  Devices::Cuda, short >,
+   ArrayView< double, Devices::Cuda, short >,
+   ArrayView< MyData, Devices::Cuda, short >,
+   ArrayView< int,    Devices::Cuda, int >,
+   ArrayView< long,   Devices::Cuda, int >,
+   ArrayView< float,  Devices::Cuda, int >,
+   ArrayView< double, Devices::Cuda, int >,
+   ArrayView< MyData, Devices::Cuda, int >,
+   ArrayView< int,    Devices::Cuda, long >,
+   ArrayView< long,   Devices::Cuda, long >,
+   ArrayView< float,  Devices::Cuda, long >,
+   ArrayView< double, Devices::Cuda, long >,
+   ArrayView< MyData, Devices::Cuda, long >,
 #endif
 #ifdef HAVE_MIC
-   ,
-   Array< short,  Devices::MIC, short >,
-   Array< int,    Devices::MIC, short >,
-   Array< long,   Devices::MIC, short >,
-   Array< float,  Devices::MIC, short >,
-   Array< double, Devices::MIC, short >,
+   ArrayView< int,    Devices::MIC, short >,
+   ArrayView< long,   Devices::MIC, short >,
+   ArrayView< float,  Devices::MIC, short >,
+   ArrayView< double, Devices::MIC, short >,
    // TODO: MyData does not work on MIC
-//   Array< MyData, Devices::MIC, short >,
-   Array< short,  Devices::MIC, int >,
-   Array< int,    Devices::MIC, int >,
-   Array< long,   Devices::MIC, int >,
-   Array< float,  Devices::MIC, int >,
-   Array< double, Devices::MIC, int >,
+//   ArrayView< MyData, Devices::MIC, short >,
+   ArrayView< int,    Devices::MIC, int >,
+   ArrayView< long,   Devices::MIC, int >,
+   ArrayView< float,  Devices::MIC, int >,
+   ArrayView< double, Devices::MIC, int >,
    // TODO: MyData does not work on MIC
-//   Array< MyData, Devices::MIC, int >,
-   Array< short,  Devices::MIC, long >,
-   Array< int,    Devices::MIC, long >,
-   Array< long,   Devices::MIC, long >,
-   Array< float,  Devices::MIC, long >,
-   Array< double, Devices::MIC, long >
+//   ArrayView< MyData, Devices::MIC, int >,
+   ArrayView< int,    Devices::MIC, long >,
+   ArrayView< long,   Devices::MIC, long >,
+   ArrayView< float,  Devices::MIC, long >,
+   ArrayView< double, Devices::MIC, long >,
    // TODO: MyData does not work on MIC
-//   Array< MyData, Devices::MIC, long >
+//   ArrayView< MyData, Devices::MIC, long >,
+#endif
+
+   // all ArrayView tests should also work with VectorView
+   // (but we can't test all types because the argument list would be too long...)
+   VectorView< float,  Devices::Host, long >,
+   VectorView< double, Devices::Host, long >
+#ifdef HAVE_CUDA
+   ,
+   VectorView< float,  Devices::Cuda, long >,
+   VectorView< double, Devices::Cuda, long >
+#endif
+#ifdef HAVE_MIC
+   ,
+   VectorView< float,  Devices::MIC, long >,
+   VectorView< double, Devices::MIC, long >
 #endif
 >;
 
-TYPED_TEST_CASE( ArrayViewTest, ArrayTypes );
+TYPED_TEST_CASE( ArrayViewTest, ViewTypes );
 
 
 TYPED_TEST( ArrayViewTest, constructors )
 {
    using ArrayType = typename TestFixture::ArrayType;
    using ViewType = typename TestFixture::ViewType;
-   using ConstViewType = ArrayView< const typename ArrayType::ValueType, typename ArrayType::DeviceType, typename ArrayType::IndexType >;
+   using ConstViewType = VectorView< const typename ArrayType::ValueType, typename ArrayType::DeviceType, typename ArrayType::IndexType >;
 
    ArrayType a( 10 );
    EXPECT_EQ( a.getSize(), 10 );
@@ -410,6 +416,7 @@ TYPED_TEST( ArrayViewTest, assignmentOperator )
 {
    using ArrayType = typename TestFixture::ArrayType;
    using ViewType = typename TestFixture::ViewType;
+   using ConstViewType = VectorView< const typename ArrayType::ValueType, typename ArrayType::DeviceType, typename ArrayType::IndexType >;
 
    ArrayType a( 10 ), b( 10 );
    typename ArrayType::HostType a_host( 10 );
@@ -437,6 +444,11 @@ TYPED_TEST( ArrayViewTest, assignmentOperator )
    u_host = u;
    EXPECT_EQ( u_host, u );
    EXPECT_EQ( u_host.getData(), a_host.getData() );
+
+   // assignment of const view to non-const view
+   v.setValue( 0 );
+   ConstViewType c( u );
+   v = c;
 }
 
 // test works only for arithmetic types
diff --git a/src/UnitTests/Containers/VectorTest.h b/src/UnitTests/Containers/VectorTest.h
index d2cf8217f80fbe4503eed73f62ab81bd3970fe6a..47d3908921dcdcdda5689df8ba8e4e424b84c2f2 100644
--- a/src/UnitTests/Containers/VectorTest.h
+++ b/src/UnitTests/Containers/VectorTest.h
@@ -755,6 +755,23 @@ TEST( VectorSpecialCasesTest, initializationOfVectorViewByArrayView )
    EXPECT_EQ( v_view.sum(), 0 );
 }
 
+TEST( VectorSpecialCasesTest, defaultConstructors )
+{
+   using ArrayType = Containers::Array< int, Devices::Host >;
+   using VectorViewType = VectorView< int, Devices::Host >;
+   using ArrayViewType = ArrayView< int, Devices::Host >;
+
+   ArrayType a( 100 );
+   a.setValue( 0 );
+
+   ArrayViewType a_view;
+   a_view.bind( a );
+
+   VectorViewType v_view;
+   v_view.bind( a );
+   EXPECT_EQ( v_view.getData(), a_view.getData() );
+}
+
 #endif // HAVE_GTEST
 
 
diff --git a/src/UnitTests/Functions/Functions.h b/src/UnitTests/Functions/Functions.h
index 59a0335c15af53b880f716318e571688f593d91c..88ba18bc8784cf72eb59301185b6c6c9a7acd321 100644
--- a/src/UnitTests/Functions/Functions.h
+++ b/src/UnitTests/Functions/Functions.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          DistributedGridTest.cpp  -  description
+                          Functions.h  -  description
                              -------------------
     begin                : Sep 6, 2017
     copyright            : (C) 2017 by Tomas Oberhuber et al.
diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
index 4907f1d269cfc3d7f5d205405546675f1b8d58fe..000a832b6011cd7b444ed27d141a8debbfec7c38 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp
@@ -578,7 +578,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
             StaticVector<1,typename CutMeshType::IndexType>(4) );
 
    
-   String FileName=String("/tmp/test-file.tnl");
+   String FileName=String("test-file.tnl");
    if(inCut)
    {
        MeshFunction<CutMeshType> cutMeshFunction;
@@ -612,7 +612,8 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
        loadMeshFunctionptr.bind(globalCutGrid,loaddof);
 
         File file;
-        file.open( FileName, IOMode::read );
+        bool ok=file.open( FileName, IOMode::read );
+        TNL_ASSERT_TRUE(ok,"Cannot open file");
         loadMeshFunctionptr.boundLoad(file);
         file.close();
  
@@ -630,9 +631,12 @@ TEST(CutDistributedMeshFunction, 3D_2_Save)
                 fromEntity.refresh();
                 outEntity.refresh();
 
-                EXPECT_EQ(loadMeshFunctionptr.getValue(outEntity), (*linearFunctionPtr)(fromEntity)) << "Error in Left overlap";
+                EXPECT_EQ(loadMeshFunctionptr.getValue(outEntity), (*linearFunctionPtr)(fromEntity)) << "Error at "<< i <<" "<< j;
 
             }
+
+        EXPECT_EQ( std::remove( FileName.getString()) , 0 );
+        EXPECT_EQ( std::remove( (FileName+String("-mesh.tnl")).getString()) , 0 );
       }
   
 }
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
index e8625bc3d6ab57d3ae01b23d99e68dc34792d7a2..0d2bc77ede3577153944398c8a61c6370d1bf876 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
+++ b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp
@@ -3,7 +3,6 @@
 
 #include <TNL/Meshes/DistributedMeshes/Directions.h>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/StaticVectorFor.h>
 
 using namespace TNL::Meshes::DistributedMeshes;
 using namespace TNL::Containers;
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
index 9ac299621b15b85ff2bd31126af19c647a9158e2..5a0a43bcfb31c48da4a1069bde5f1f3847eba3f7 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h
@@ -244,8 +244,8 @@ class TestDistributedGridIO
             
         linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
  
-        String FileName=String("/tmp/test-file.tnl");
-        DistributedGridIO<MeshFunctionType> ::save(FileName, *meshFunctionptr );
+        String fileName=String("test-file-distriburtegrid-io-save.tnl");
+        DistributedGridIO<MeshFunctionType> ::save(fileName, *meshFunctionptr );
 
 
        //create similar local mesh function and evaluate linear function on it
@@ -273,8 +273,10 @@ class TestDistributedGridIO
 
         loadDof.setValue(-1);
         
+        String localFileName= fileName+String("-")+distributedGrid.printProcessCoords()+String(".tnl");
+
         File file;
-        file.open( FileName+String("-")+distributedGrid.printProcessCoords(), IOMode::read );
+        file.open(localFileName, IOMode::read );
         loadMeshFunctionptr->boundLoad(file);
         file.close();
 
@@ -282,6 +284,11 @@ class TestDistributedGridIO
         {
             EXPECT_EQ( localDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
+
+        EXPECT_EQ( std::remove( localFileName.getString()) , 0 );
+
+       //remove meshfile
+       EXPECT_EQ( std::remove( (fileName+String("-mesh-")+distributedGrid.printProcessCoords()+String(".tnl")).getString()) , 0 );
     }
     
     static void TestLoad()
@@ -326,9 +333,10 @@ class TestDistributedGridIO
         linearFunctionEvaluator.evaluateAllEntities(localMeshFunctionptr , linearFunctionPtr);
 
 
-        String FileName=String("/tmp/test-file.tnl");
+        String fileName=String("test-file-distributedgrid-io-load.tnl");
+        String localFileName=fileName+String("-")+distributedGrid.printProcessCoords()+String(".tnl");
         File file;
-        file.open( FileName+String("-")+distributedGrid.printProcessCoords(), IOMode::write );        
+        file.open( localFileName, IOMode::write );        
         localMeshFunctionptr->save(file);
         file.close();
 
@@ -343,7 +351,7 @@ class TestDistributedGridIO
         loadDof.setValue(0);
         loadMeshFunctionptr->bind(loadGridptr,loadDof);
 
-        DistributedGridIO<MeshFunctionType> ::load(FileName, *loadMeshFunctionptr );
+        DistributedGridIO<MeshFunctionType> ::load(fileName, *loadMeshFunctionptr );
 
         loadMeshFunctionptr->template synchronize<CommunicatorType>(); //need synchronization for overlaps to be filled corectly in loadDof
 
@@ -363,7 +371,9 @@ class TestDistributedGridIO
         for(int i=0;i<dof.getSize();i++)
         {
             EXPECT_EQ( dof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
-        }       
+        }
+
+        EXPECT_EQ( std::remove( localFileName.getString()) , 0 );
     }
 };
 
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
index a68fe628e69bcf434dabe68953180dfa6a4a6b9c..ef0160741641be58b36291cb0cf42b9594c9cc2d 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTest.h
@@ -80,7 +80,7 @@ class TestDistributedGridMPIIO{
             
         linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
  
-        String FileName=String("/tmp/test-file.tnl");
+        String FileName=String("test-file-mpiio-save.tnl");
         DistributedGridIO<MeshFunctionType,MpiIO> ::save(FileName, *meshFunctionptr );
 
        //first process compare results
@@ -108,6 +108,7 @@ class TestDistributedGridMPIIO{
             {
               EXPECT_EQ( globalEvaluatedDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
             }
+            EXPECT_EQ( std::remove( FileName.getString()) , 0 );
         }
     }
     
@@ -135,7 +136,7 @@ class TestDistributedGridMPIIO{
         SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 );
         distributedGrid.setOverlaps( lowerOverlap, upperOverlap );
 
-        String FileName=String("/tmp/test-file.tnl");         
+        String FileName=String("/tmp/test-file-mpiio-load.tnl");         
 
         //Prepare file   
         if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
@@ -178,6 +179,11 @@ class TestDistributedGridMPIIO{
         {
             EXPECT_EQ( evalDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
         }
+
+        if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0)
+        {
+            EXPECT_EQ( std::remove( FileName.getString()) , 0 );
+        }
         
     }
 };