diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa60bf04da4e179544760b8e035c52a84b1bf1e1..431296ee74bb2d4d683a3e02b8f5d93078d02dd8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,8 +44,8 @@ endif()
 # set Debug/Release options
 set( CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
 set( CMAKE_CXX_FLAGS_DEBUG "-g" )
-set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG" )
-#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
+set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
+#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
 # pass -rdynamic only in Debug mode
 set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" )
 set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" )
@@ -91,7 +91,7 @@ if( WITH_CUDA STREQUAL "yes" )
                 set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/src/Tools/tnl-cuda-arch.cu)
                 message( "Compiling tnl-cuda-arch ..." )
                 file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
-                execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
+                execute_process( COMMAND nvcc --compiler-bindir ${CUDA_HOST_COMPILER} ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
                                  RESULT_VARIABLE CUDA_ARCH_RESULT
                                  OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
                                  ERROR_VARIABLE CUDA_ARCH_OUTPUT )
@@ -171,6 +171,7 @@ if( WITH_CUDA STREQUAL "yes" )
     endif( CUDA_FOUND )
 endif( WITH_CUDA STREQUAL "yes" )
 
+
 ####
 # Check for OpenMP
 #
@@ -360,8 +361,7 @@ CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
 INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION include/tnl-${tnlVersion}/TNL )
 if( PYTHONINTERP_FOUND )
     CONFIGURE_FILE( "Config.py.in" "${PROJECT_BUILD_PATH}/TNL/Config.py" )
-    INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/Config.py DESTINATION lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/TNL )
-    CONFIGURE_FILE( "python-version.in" "${PROJECT_TOOLS_PATH}/../python-version" )
+    INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/Config.py DESTINATION lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/TNL )
 endif( PYTHONINTERP_FOUND )
 
 #Nastavime cesty k hlavickovym souborum a knihovnam
diff --git a/TODO b/TODO
index 3f427388a1ec4b4ccb2818f3532ca98bb16b0708..e83b80a67e00b0c6fa1911f2d37919b410aa03ba 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,16 @@
+- pridat execution policy https://github.com/harrism/hemi/blob/master/hemi/execution_policy.h
+- prejmenovat Assert na TNL_ASSERT a rozsirit asserce podobne jako v GTest
+- odstranit parametr lazy ze smart pointeru
+
 TODO:
  - implementovat tnlMixedGridBoundaryConditions, kde by se pro kazdou stranu gridu definoval jiny zvlastni typ
    okrajovych podminek
  - dalo by se tim resit i skladani zpetnych a doprednych diferenci u nelinearni difuze, kdy je potreba napr. dopredne diference
    vycislit i na leve a dolni hranici 2D gridu
+   - tohle resit spis primym rozepsanim schematu
 
 TODO:
- - implementovat tuple pro snazsi a snad efektoivnejsi prenos dat na GPU
+ - implementovat tuple pro snazsi a snad efektivnejsi prenos dat na GPU
  - nebylo by nutne definovat pomocne datove structury pro traverser
  - data by se na hostu preskupila do souvisleho bloku dat a ten se prenesl najednou
 
diff --git a/build b/build
index cb92740ba414ab96c34f312b5103e709b9093c2c..bd644b6a111fd298803ba4e281747b28e0c27307 100755
--- a/build
+++ b/build
@@ -20,7 +20,7 @@ HELP="no"
 VERBOSE=""
 ROOT_DIR="."
 DCMTK_DIR="/usr/include/dcmtk"
-BUILD_JOBS=`grep -c processor /proc/cpuinfo`
+BUILD_JOBS=""
 
 for option in "$@"
 do
@@ -112,9 +112,23 @@ then
     exit 1
 fi
 
-echo "Building ${BUILD} $TARGET using $BUILD_JOBS processors ..."
+if [[ -n ${BUILD_JOBS} ]]; then
+    # override $MAKEFLAGS from parent environment
+    export MAKEFLAGS=-j${BUILD_JOBS}
+elif [[ -z ${MAKEFLAGS} ]]; then
+    # $BUILD_JOBS and $MAKEFLAGS are not set => set default value
+    BUILD_JOBS=$(grep "core id" /proc/cpuinfo | sort -u | wc -l)
+    export MAKEFLAGS=-j${BUILD_JOBS}
+fi
+
+if [[ -n ${BUILD_JOBS} ]]; then
+    echo "Building ${BUILD} $TARGET using $BUILD_JOBS processors ..."
+else
+    # number of processors is unknown - it is encoded in $MAKEFLAGS from parent environment
+    echo "Building ${BUILD} $TARGET ..."
+fi
 
-make -j${BUILD_JOBS} ${VERBOSE}
+make ${VERBOSE}
 if test $? != 0; then
     echo "Error: Build process failed."
     exit 1
@@ -123,7 +137,7 @@ fi
 
 if test WITH_TESTS = "yes";
 then
-    make -j${BUILD_JOBS} test
+    make test
     if test $? != 0; then
         echo "Error: Some test did not pass successfuly."
     fi
diff --git a/examples/advection/advectionProblem.h b/examples/advection/advectionProblem.h
index a44048c08c7899a47a8ee27a6f91a5d758eff1f4..b878f37881ecb47cdb8e46aece9152c2ff84c720 100644
--- a/examples/advection/advectionProblem.h
+++ b/examples/advection/advectionProblem.h
@@ -69,7 +69,7 @@ class advectionProblem:
       void bindDofs( const MeshPointer& mesh,
                      DofVectorPointer& dofs );
 
-      void getExplicitRHS( const RealType& time,
+      void getExplicitUpdate( const RealType& time,
                            const RealType& tau,
                            const MeshPointer& mesh,
                            DofVectorPointer& _u,
diff --git a/examples/advection/advectionProblem_impl.h b/examples/advection/advectionProblem_impl.h
index cd37f52eff7454b863d14eb245b2b9d7236a5a37..bebfff5c631a979af3d0a7e35a7686cdfabf0348 100644
--- a/examples/advection/advectionProblem_impl.h
+++ b/examples/advection/advectionProblem_impl.h
@@ -242,7 +242,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 advectionProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshPointer& mesh,
                 DofVectorPointer& _u,
diff --git a/examples/heat-equation/tnl-heat-equation.h b/examples/heat-equation/tnl-heat-equation.h
index 43a42da6689b68747005a9378add270c60a522d7..bb9de072e09465d93ba8bacdda947cfc474c1438 100644
--- a/examples/heat-equation/tnl-heat-equation.h
+++ b/examples/heat-equation/tnl-heat-equation.h
@@ -64,14 +64,11 @@ class heatEquationSetter
    typedef Device DeviceType;
    typedef Index IndexType;
 
-   typedef Containers::StaticVector< MeshType::meshDimensions, Real > Vertex;
-
    static bool run( const Config::ParameterContainer& parameters )
    {
       enum { Dimensions = MeshType::meshDimensions };
       typedef Operators::LinearDiffusion< MeshType, Real, Index > ApproximateOperator;
       typedef Functions::Analytic::Constant< Dimensions, Real > RightHandSide;
-      typedef Containers::StaticVector < MeshType::meshDimensions, Real > Vertex;
 
       String boundaryConditionsType = parameters.getParameter< String >( "boundary-conditions-type" );
       if( parameters.checkParameter( "boundary-conditions-constant" ) )
diff --git a/examples/heat-equation/tnl-run-heat-equation-eoc-test b/examples/heat-equation/tnl-run-heat-equation-eoc-test
index fb39336e07eec7bbcd288cbdf3be2942404f3232..014d13ebab35272cbef5aeafbeb2a692bd368daa 100644
--- a/examples/heat-equation/tnl-run-heat-equation-eoc-test
+++ b/examples/heat-equation/tnl-run-heat-equation-eoc-test
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 device="host"
+threadsNumbers="1 2 4 6"
 dimensions="1D 2D 3D"
 #dimensions="1D"
 sizes1D="16 32 64 128 256 512"
@@ -84,6 +85,7 @@ solve()
 {
    timeDiscretisation=$1
    discreteSolver=$2
+   threadsNumber=$3
    ${solverName} --device ${device} \
                  --mesh mesh.tnl \
                  --initial-condition exact-u-00000.tnl \
@@ -113,7 +115,10 @@ solve()
                  --sigma ${sigma} \
                  --time-dependence ${timeDependence} \
                  --snapshot-period ${snapshotPeriod} \
-                 --final-time ${finalTime}
+                 --final-time ${finalTime} \
+                 --refresh-rate 50 \
+                 --openmp-enabled true \
+                 --openmp-max-threads ${threadsNumber}
 }
                
 computeError()
@@ -128,77 +133,84 @@ computeError()
 
 runTest()
 {
-   for testFunction in ${testFunctions};
+   for threadsNumber in ${threadsNumbers};
    do
-      mkdir -p ${testFunction}
-      cd ${testFunction}
-      setupTestFunction ${testFunction}
-      
-      for dim in ${dimensions};
-      do
-         mkdir -p $dim
-         cd ${dim}
-         if test $dim = 1D;
-         then 
-            sizes=$sizes1D
-         fi
-         if test $dim = 2D;
-         then 
-            sizes=$sizes2D
-         fi
-         if test $dim = 3D;
-         then 
-            sizes=$sizes3D
-         fi
-         
-         lastSize=""
-         for size in $sizes;
-         do
-            mkdir -p $size
-            cd $size
-            echo ""
-            echo ""
-            echo ""
-            if test ! -f computation-done;
-            then
-               touch computation-in-progress
-               echo "========================================================================="
-               echo "===                   SETTING UP THE GRID                             ==="
-               echo "========================================================================="
-               setupGrid $dim $size 
-               echo "========================================================================="
-               echo "===                WRITING THE EXACT SOLUTION                         ==="
-               echo "========================================================================="
-               setInitialCondition $testFunction
-               echo "========================================================================="
-               echo "===                   STARTING THE SOLVER                             ==="
-               echo "========================================================================="
-               #solve explicit merson
-               solve semi-implicit gmres
-               mv computation-in-progress computation-done
-            fi            
-            echo "========================================================================="
-            echo "===                   COMPUTING THE ERROR                             ==="
-            echo "========================================================================="
-            computeError
-            echo "========================================================================="
-            echo "===                     COMPUTING THE EOC                             ==="            
-            echo "========================================================================="
-            if test ! x$lastSize = x;
-            then
-               tnl-err2eoc ../$lastSize/errors.txt errors.txt
-            fi
-            echo "========================================================================="
-            echo "===                     COMPUTATION DONE                              ==="            
-            echo "========================================================================="
-            cd ..
-            lastSize=$size
-         done
+      mkdir -p threads-${threadsNumber}
+      cd threads-${threadsNumber}
 
-         cd ..
-      done
-      cd ..
-   done
+        for testFunction in ${testFunctions};
+        do
+           mkdir -p ${testFunction}
+           cd ${testFunction}
+           setupTestFunction ${testFunction}
+
+           for dim in ${dimensions};
+           do
+              mkdir -p $dim
+              cd ${dim}
+              if test $dim = 1D;
+              then 
+                 sizes=$sizes1D
+              fi
+              if test $dim = 2D;
+              then 
+                 sizes=$sizes2D
+              fi
+              if test $dim = 3D;
+              then 
+                 sizes=$sizes3D
+              fi
+
+              lastSize=""
+              for size in $sizes;
+              do
+                 mkdir -p $size
+                 cd $size
+                 echo ""
+                 echo ""
+                 echo ""
+                 if test ! -f computation-done;
+                 then
+                    touch computation-in-progress
+                    echo "========================================================================="
+                    echo "===                   SETTING UP THE GRID                             ==="
+                    echo "========================================================================="
+                    setupGrid $dim $size 
+                    echo "========================================================================="
+                    echo "===                WRITING THE EXACT SOLUTION                         ==="
+                    echo "========================================================================="
+                    setInitialCondition $testFunction
+                    echo "========================================================================="
+                    echo "===                   STARTING THE SOLVER                             ==="
+                    echo "========================================================================="
+                    #solve explicit merson ${threadsNumber}
+                    solve semi-implicit gmres ${threadsNumber}
+                    mv computation-in-progress computation-done
+                 fi            
+                 echo "========================================================================="
+                 echo "===                   COMPUTING THE ERROR                             ==="
+                 echo "========================================================================="
+                 computeError
+                 echo "========================================================================="
+                 echo "===                     COMPUTING THE EOC                             ==="            
+                 echo "========================================================================="
+                 if test ! x$lastSize = x;
+                 then
+                    tnl-err2eoc ../$lastSize/errors.txt errors.txt
+                 fi
+                 echo "========================================================================="
+                 echo "===                     COMPUTATION DONE                              ==="            
+                 echo "========================================================================="
+                 cd ..
+                 lastSize=$size
+              done
+
+              cd ..
+           done
+           cd ..
+        done
+        cd ..
+    done
 }
 
 runTest
diff --git a/examples/inviscid-flow/1d/eulerProblem.h b/examples/inviscid-flow/1d/eulerProblem.h
index f687ee5460accc7a593bb8f6e034872f8d8e6eda..a5804c783bc4afa04385257787693aaef919029b 100644
--- a/examples/inviscid-flow/1d/eulerProblem.h
+++ b/examples/inviscid-flow/1d/eulerProblem.h
@@ -75,7 +75,7 @@ class eulerProblem:
       void bindDofs( const MeshPointer& mesh,
                      DofVectorPointer& dofs );
 
-      void getExplicitRHS( const RealType& time,
+      void getExplicitUpdate( const RealType& time,
                            const RealType& tau,
                            const MeshPointer& mesh,
                            DofVectorPointer& _u,
diff --git a/examples/inviscid-flow/1d/eulerProblem_impl.h b/examples/inviscid-flow/1d/eulerProblem_impl.h
index 0ed9d4f1a5da37c32faa7fc0e41ef4bc92e84521..43e45d6693c7a8963d334030c266498dae3b3761 100644
--- a/examples/inviscid-flow/1d/eulerProblem_impl.h
+++ b/examples/inviscid-flow/1d/eulerProblem_impl.h
@@ -250,7 +250,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 eulerProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshPointer& mesh,
                 DofVectorPointer& _u,
diff --git a/examples/inviscid-flow/2d/eulerProblem.h b/examples/inviscid-flow/2d/eulerProblem.h
index 71a9a7a7c1b71f96afd3c289258f27225337f854..60fa8e0df8f5c7c44c4219f9076c4e2afdfa6184 100644
--- a/examples/inviscid-flow/2d/eulerProblem.h
+++ b/examples/inviscid-flow/2d/eulerProblem.h
@@ -76,7 +76,7 @@ class eulerProblem:
       void bindDofs( const MeshPointer& mesh,
                      DofVectorPointer& dofs );
 
-      void getExplicitRHS( const RealType& time,
+      void getExplicitUpdate( const RealType& time,
                            const RealType& tau,
                            const MeshPointer& mesh,
                            DofVectorPointer& _u,
diff --git a/examples/inviscid-flow/2d/eulerProblem_impl.h b/examples/inviscid-flow/2d/eulerProblem_impl.h
index 27d4aa4ae67b7cacf7fdfb13214349a8a996a874..7b5fc93f8389a0808d454eb5a5f238e5f29699cf 100644
--- a/examples/inviscid-flow/2d/eulerProblem_impl.h
+++ b/examples/inviscid-flow/2d/eulerProblem_impl.h
@@ -235,7 +235,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 eulerProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshPointer& mesh,
                 DofVectorPointer& _u,
diff --git a/examples/navier-stokes/navierStokesSolver.h b/examples/navier-stokes/navierStokesSolver.h
index efdc431a7885e4078b6199e94894407fc374898b..5e05a75aeec5e2bc788ee305cbb1ad3a48a91ea7 100644
--- a/examples/navier-stokes/navierStokesSolver.h
+++ b/examples/navier-stokes/navierStokesSolver.h
@@ -93,7 +93,7 @@ class navierStokesSolver
 
    bool solve();
 
-   void GetExplicitRHS( const RealType& time,
+   void getExplicitUpdate( const RealType& time,
                         const RealType& tau,
                         DofVectorType& _u,
                         DofVectorType& _fu );
diff --git a/examples/navier-stokes/navierStokesSolver_impl.h b/examples/navier-stokes/navierStokesSolver_impl.h
index 8fa92c2cc1624ea6ca167ee8475ff53ddd980208..b1f607475c396f4905909474b0f2bf7324eea2ee 100644
--- a/examples/navier-stokes/navierStokesSolver_impl.h
+++ b/examples/navier-stokes/navierStokesSolver_impl.h
@@ -260,12 +260,12 @@ bool navierStokesSolver< Mesh, EulerScheme > :: makeSnapshot( const RealType& t,
 }
 
 template< typename Mesh, typename EulerScheme >
-void navierStokesSolver< Mesh, EulerScheme > :: GetExplicitRHS(  const RealType& time,
+void navierStokesSolver< Mesh, EulerScheme > :: getExplicitUpdate(  const RealType& time,
                                                                  const RealType& tau,
                                                                  DofVectorType& u,
                                                                  DofVectorType& fu )
 {
-   nsSolver.getExplicitRhs( time, tau, u, fu );
+   nsSolver.getExplicitUpdate( time, tau, u, fu );
    solverMonitor.uMax = this->mesh.getAbsMax( nsSolver.getU() );
    solverMonitor.uAvg = this->mesh.getLpNorm( nsSolver.getU(), 1.0 );
    solverMonitor.eMax = this->mesh.getAbsMax( nsSolver.getEnergy() );
diff --git a/install b/install
index c0fc3f6023f3968b999de3f32b3b910ae1d2484b..139fe67d3b2cd3b5bec3d973a1d336e071ea5182 100755
--- a/install
+++ b/install
@@ -107,27 +107,5 @@ then
     BUILD_PREFIX="Debug"
 fi
 
-
-PYTHON_TEST="`python src/Tools/python-path-test.py 2> /dev/null`"
-#echo "xxxxx ${PYTHON_TEST} xxxxx\n"
-if test PYTHON_TEST != "xOK";
-then
-    source ${BUILD_PREFIX}/python-version    
-    echo ""
-    echo "WARNING !!!"
-    echo ""
-    echo "Your system does not see TNL Python modules which were installed right now."
-    echo "You need to add it to your system variables PATH and LD_LIBRARY_PATH."
-    echo "Add the following to your .bashrc file:"
-    echo ""
-    echo "if test x\${PYTHONPATH} = x;"
-    echo "then"
-    echo "   PYTHONPATH=${PREFIX}/lib/python${PYTHON_VERSION}"
-    echo "else"
-    echo "   PYTHONPATH=\${PYTHONPATH}:${PREFIX}/lib/python${PYTHON_VERSION}"
-    echo "fi" 
-    echo "export PYTHONPATH" 
-fi
-
 exit 0
 
diff --git a/src/TNL/Assert.h b/src/TNL/Assert.h
index ddf2dc9bcd8b2098870d59905079f149eb736f84..f8f60200d0bdfdc4910334174bb7b9ecd3a63977 100644
--- a/src/TNL/Assert.h
+++ b/src/TNL/Assert.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          VectorOperationsTester.h  -  description
+                          Assert.h  -  description
                              -------------------
     begin                : Jan 12, 2010
     copyright            : (C) 2013 by Tomas Oberhuber
@@ -22,36 +22,40 @@
 
 #endif
 
-namespace TNL {
-
 #ifndef NDEBUG   
    
-#ifdef HAVE_CUDA
-#define Assert( ___tnl__assert_condition, ___tnl__assert_command )                                    \
-   if( ! ( ___tnl__assert_condition ) )                                                                  \
-   {                                                                                                     \
+#if defined( __NVCC__ ) && ( __CUDACC_VER__ < 80000 )
+    #define TNL_PRETTY_FUNCTION "(not known in CUDA 7.5 or older)"
+#else
+    #define TNL_PRETTY_FUNCTION __PRETTY_FUNCTION__
+#endif
+
+// __CUDA_ARCH__ is defined by the compiler only for code executed on GPU
+#ifdef __CUDA_ARCH__
+#define TNL_ASSERT( ___tnl__assert_condition, ___tnl__assert_command )                                     \
+   if( ! ( ___tnl__assert_condition ) )                                                                    \
+   {                                                                                                       \
    printf( "Assertion '%s' failed !!! \n File: %s \n Line: %d \n Diagnostics: Not supported with CUDA.\n", \
-           __STRING( ___tnl__assert_condition ),                                                         \
-           __FILE__,                                                                                     \
-           __LINE__ );                                                                                   \
-                                                              \
+           __STRING( ___tnl__assert_condition ),                                                           \
+           __FILE__,                                                                                       \
+           __LINE__ );                                                                                     \
+                                                                                                           \
    }
 
-#else // HAVE_CUDA
-#define Assert( ___tnl__assert_condition, ___tnl__assert_command )                       \
-	if( ! ( ___tnl__assert_condition ) )                                                     \
-	{                                                                                        \
-	std::cerr << "Assertion '" << __STRING( ___tnl__assert_condition ) << "' failed !!!" << std::endl  \
-             << "File: " << __FILE__ << std::endl                                                \
-             << "Function: " << __PRETTY_FUNCTION__ << std::endl                                 \
-             << "Line: " << __LINE__ << std::endl                                                \
-             << "Diagnostics: ";                                                            \
-        ___tnl__assert_command;                                                             \
-        throw EXIT_FAILURE;                                                                 \
-	}
-#endif /* HAVE_CUDA */
+#else // __CUDA_ARCH__
+#define TNL_ASSERT( ___tnl__assert_condition, ___tnl__assert_command )                                  \
+   if( ! ( ___tnl__assert_condition ) )                                                                 \
+   {                                                                                                    \
+   std::cerr << "Assertion '" << __STRING( ___tnl__assert_condition ) << "' failed !!!" << std::endl    \
+             << "File: " << __FILE__ << std::endl                                                       \
+             << "Function: " << TNL_PRETTY_FUNCTION << std::endl                                        \
+             << "Line: " << __LINE__ << std::endl                                                       \
+             << "Diagnostics: ";                                                                        \
+        ___tnl__assert_command;                                                                         \
+        throw EXIT_FAILURE;                                                                             \
+   }
+#endif // __CUDA_ARCH__
+
 #else /* #ifndef NDEBUG */
-#define Assert( ___tnl__assert_condition, ___tnl__assert_command )
+#define TNL_ASSERT( ___tnl__assert_condition, ___tnl__assert_command )
 #endif /* #ifndef NDEBUG */
-
-} // namespace TNL
diff --git a/src/TNL/CMakeLists.txt b/src/TNL/CMakeLists.txt
index d3371ab5e82b11bb763efbacb817f2333b85e09d..957120fdac7d916dad1309131e8fb714224ee7a3 100755
--- a/src/TNL/CMakeLists.txt
+++ b/src/TNL/CMakeLists.txt
@@ -1,5 +1,6 @@
 ADD_SUBDIRECTORY( Config )
 ADD_SUBDIRECTORY( Containers )
+ADD_SUBDIRECTORY( Debugging )
 ADD_SUBDIRECTORY( Devices )
 ADD_SUBDIRECTORY( Experimental )
 ADD_SUBDIRECTORY( Functions )
@@ -25,8 +26,6 @@ set( headers
      File_impl.h
      FileName.h
      Object.h
-     List.h
-     List_impl.h
      Logger.h
      Logger_impl.h
      Math.h
@@ -37,7 +36,6 @@ set( headers
      SmartPointersRegister.h
      StaticFor.h
      String.h
-     SystemInfo.h
      Timer.h
      UniquePointer.h )
 
@@ -48,7 +46,6 @@ set( common_SOURCES
      Logger.cpp
      SmartPointersRegister.cpp
      String.cpp
-     SystemInfo.cpp
      Timer.cpp )
      
 set( tnl_SOURCES ${tnl_config_SOURCES}
diff --git a/src/TNL/Config/ConfigDescription.cpp b/src/TNL/Config/ConfigDescription.cpp
index 6822aec9ce7d0551a2e598ef93f394d95e8bd57b..6145103ef64c155cd8b4387d381e135ce47cba70 100644
--- a/src/TNL/Config/ConfigDescription.cpp
+++ b/src/TNL/Config/ConfigDescription.cpp
@@ -129,7 +129,7 @@ bool Config::ConfigDescription :: checkMissingEntries( Config::ParameterContaine
 {
    int i;
    const int size = entries. getSize();
-   List< String > missingParameters;
+   Containers::List< String > missingParameters;
    for( i = 0; i < size; i ++ )
    {
       const char* entry_name = entries[ i ] -> name. getString();
diff --git a/src/TNL/Config/ConfigDescription.h b/src/TNL/Config/ConfigDescription.h
index 5c210fa6df578267f1134697092ba6cefd454dfb..9fab40b7e8591ef132330fd723cc434666c6210b 100644
--- a/src/TNL/Config/ConfigDescription.h
+++ b/src/TNL/Config/ConfigDescription.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/String.h>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/param-types.h>
 #include <TNL/Config/ConfigEntryType.h>
 #include <TNL/Config/ConfigEntry.h>
@@ -88,13 +88,13 @@ class ConfigDescription
    template< typename EntryType >
    void addEntryEnum( const EntryType& entryEnum )
    {
-      Assert( this->currentEntry,);
+      TNL_ASSERT( this->currentEntry,);
       ( ( ConfigEntry< EntryType >* ) currentEntry )->getEnumValues().Append( entryEnum );
    }
 
    void addEntryEnum( const char* entryEnum )
    {
-      Assert( this->currentEntry,);
+      TNL_ASSERT( this->currentEntry,);
       ( ( ConfigEntry< String >* ) currentEntry )->getEnumValues().Append( String( entryEnum ) );
    }
 
@@ -166,7 +166,7 @@ class ConfigDescription
 
    protected:
 
-   List< ConfigEntryBase* > entries;
+   Containers::List< ConfigEntryBase* > entries;
 
    ConfigEntryBase* currentEntry;
 
diff --git a/src/TNL/Config/ConfigEntry.h b/src/TNL/Config/ConfigEntry.h
index fa452fb4ea0ba23a6bc320434f8de170d9620c24..759a77ef3d52d991c4a1f879a7e1e5390023159d 100644
--- a/src/TNL/Config/ConfigEntry.h
+++ b/src/TNL/Config/ConfigEntry.h
@@ -11,6 +11,7 @@
 #pragma once 
 
 #include <TNL/Config/ConfigEntryBase.h>
+#include <TNL/Containers/List.h>
 
 namespace TNL {
 namespace Config {
@@ -20,7 +21,7 @@ struct ConfigEntry : public ConfigEntryBase
 {
    EntryType defaultValue;
 
-   List< EntryType > enumValues;
+   Containers::List< EntryType > enumValues;
 
    public:
 
@@ -61,7 +62,7 @@ struct ConfigEntry : public ConfigEntryBase
       return convertToString( defaultValue );
    };
 
-   List< EntryType >& getEnumValues()
+   Containers::List< EntryType >& getEnumValues()
    {
       return this->enumValues;
    }
diff --git a/src/TNL/Config/ConfigEntryList.h b/src/TNL/Config/ConfigEntryList.h
index 01db24d6d6543bda7b3a12d24fa0772b3313c5ff..e8fabf5c58338177e38fa8a1ef652400c2ac79c7 100644
--- a/src/TNL/Config/ConfigEntryList.h
+++ b/src/TNL/Config/ConfigEntryList.h
@@ -11,6 +11,7 @@
 #pragma once 
 
 #include <TNL/Config/ConfigEntryBase.h>
+#include <TNL/Containers/List.h>
 
 namespace TNL {
 namespace Config {  
@@ -20,7 +21,7 @@ struct ConfigEntryList : public ConfigEntryBase
 {
    EntryType defaultValue;
 
-   List< EntryType > enumValues;
+   Containers::List< EntryType > enumValues;
 
    public:
 
@@ -49,12 +50,12 @@ struct ConfigEntryList : public ConfigEntryBase
 
    String getEntryType() const
    {
-      return TNL::getType< List< EntryType > >();
+      return TNL::getType< Containers::List< EntryType > >();
    }
 
    String getUIEntryType() const
    {
-      return TNL::Config::getUIEntryType< List< EntryType > >();
+      return TNL::Config::getUIEntryType< Containers::List< EntryType > >();
    }
 
    String printDefaultValue() const
@@ -62,7 +63,7 @@ struct ConfigEntryList : public ConfigEntryBase
       return convertToString( defaultValue );
    };
 
-   List< EntryType >& getEnumValues()
+   Containers::List< EntryType >& getEnumValues()
    {
       return this->enumValues;
    }
@@ -84,7 +85,7 @@ struct ConfigEntryList : public ConfigEntryBase
      std::cout << " ";
    }
 
-   bool checkValue( const List< EntryType >& values ) const
+   bool checkValue( const Containers::List< EntryType >& values ) const
    {
       if( this->enumValues.getSize() != 0 )
       {
diff --git a/src/TNL/Config/ConfigEntryType.h b/src/TNL/Config/ConfigEntryType.h
index 0d104cb63add85408a70ecdbad400426e3ee1ee5..4ba052d022dd160ad633983d255b5dbfbb318edb 100644
--- a/src/TNL/Config/ConfigEntryType.h
+++ b/src/TNL/Config/ConfigEntryType.h
@@ -10,6 +10,8 @@
 
 #pragma once 
 
+#include <TNL/Containers/List.h>
+
 namespace TNL {
 namespace Config {
 
@@ -21,10 +23,10 @@ template<> inline String getUIEntryType< bool >()      { return "bool"; };
 template<> inline String getUIEntryType< int >()       { return "integer"; };
 template<> inline String getUIEntryType< double >()    { return "real"; };
 
-template<> inline String getUIEntryType< List< String > >() { return "list of string"; };
-template<> inline String getUIEntryType< List< bool > >()      { return "list of bool"; };
-template<> inline String getUIEntryType< List< int > >()       { return "list of integer"; };
-template<> inline String getUIEntryType< List< double > >()    { return "list of real"; };
+template<> inline String getUIEntryType< Containers::List< String > >() { return "list of string"; };
+template<> inline String getUIEntryType< Containers::List< bool > >()      { return "list of bool"; };
+template<> inline String getUIEntryType< Containers::List< int > >()       { return "list of integer"; };
+template<> inline String getUIEntryType< Containers::List< double > >()    { return "list of real"; };
 
 struct ConfigEntryType
 {
diff --git a/src/TNL/Config/ParameterContainer.cpp b/src/TNL/Config/ParameterContainer.cpp
index ef2b335dae862cef51cff879eb8cf26cfa8f4a3d..b2e77dd5e21d4747cb89dfddf6cb53f4ef423a16 100644
--- a/src/TNL/Config/ParameterContainer.cpp
+++ b/src/TNL/Config/ParameterContainer.cpp
@@ -200,7 +200,7 @@ parseCommandLine( int argc, char* argv[],
             std::cerr << "Missing value for the parameter " << option << "." << std::endl;
             return false;
          }
-         List< String > parsedEntryType;
+         Containers::List< String > parsedEntryType;
          if( ! parseObjectType( entryType, parsedEntryType ) )
          {
             std::cerr << "Internal error: Uknown config entry type " << entryType << "." << std::endl;
@@ -208,26 +208,26 @@ parseCommandLine( int argc, char* argv[],
          }
          if( parsedEntryType[ 0 ] == "List" )
          {
-            List< String >* string_list( 0 );
-            List< bool >* bool_list( 0 );
-            List< int >* integer_list( 0 );
-            List< double >* real_list( 0 );
+            Containers::List< String >* string_list( 0 );
+            Containers::List< bool >* bool_list( 0 );
+            Containers::List< int >* integer_list( 0 );
+            Containers::List< double >* real_list( 0 );
 
             if( parsedEntryType[ 1 ] == "String" )
-               string_list = new List< String >;
+               string_list = new Containers::List< String >;
             if( parsedEntryType[ 1 ] == "bool" )
-               bool_list = new List< bool >;
+               bool_list = new Containers::List< bool >;
             if( parsedEntryType[ 1 ] == "int" )
-               integer_list = new List< int >;
+               integer_list = new Containers::List< int >;
             if( parsedEntryType[ 1 ] == "double" )
-               real_list = new List< double >;
+               real_list = new Containers::List< double >;
  
             while( i < argc && ( ( argv[ i ] )[ 0 ] != '-' || ( atof( argv[ i ] ) < 0.0 && ( integer_list || real_list ) ) ) )
             {
                const char* value = argv[ i ++ ];
                if( string_list )
                {
-                  /*if( ! ( ( ConfigEntry< List< String > >* )  entry )->checkValue( String( value ) ) )
+                  /*if( ! ( ( ConfigEntry< Containers::List< String > >* )  entry )->checkValue( String( value ) ) )
                   {
                      delete string_list;
                      return false;
@@ -246,7 +246,7 @@ parseCommandLine( int argc, char* argv[],
                }
                if( integer_list )
                {
-                  /*if( ! ( ConfigEntry< List< int > >* ) entry->checkValue( atoi( value ) ) )
+                  /*if( ! ( ConfigEntry< Containers::List< int > >* ) entry->checkValue( atoi( value ) ) )
                   {
                      delete integer_list;
                      return false;
@@ -255,7 +255,7 @@ parseCommandLine( int argc, char* argv[],
                }
                if( real_list )
                {
-                  /*if( ! ( ConfigEntry< List< double > >* ) entry->checkValue( atof( value ) ) )
+                  /*if( ! ( ConfigEntry< Containers::List< double > >* ) entry->checkValue( atof( value ) ) )
                   {
                      delete real_list;
                      return false;
@@ -265,22 +265,22 @@ parseCommandLine( int argc, char* argv[],
             }
             if( string_list )
             {
-               parameters. addParameter< List< String > >( option, *string_list );
+               parameters. addParameter< Containers::List< String > >( option, *string_list );
                delete string_list;
             }
             if( bool_list )
             {
-               parameters. addParameter< List< bool > >( option, *bool_list );
+               parameters. addParameter< Containers::List< bool > >( option, *bool_list );
                delete bool_list;
             }
             if( integer_list )
             {
-               parameters. addParameter< List< int > >( option, *integer_list );
+               parameters. addParameter< Containers::List< int > >( option, *integer_list );
                delete integer_list;
             }
             if( real_list )
             {
-               parameters. addParameter< List< double > >( option, *real_list );
+               parameters. addParameter< Containers::List< double > >( option, *real_list );
                delete real_list;
             }
             if( i < argc ) i --;
diff --git a/src/TNL/Config/ParameterContainer.h b/src/TNL/Config/ParameterContainer.h
index 8230490a38de13b80613c201b046995b4ae1ae19..5f6df343000fe3a28eb1afe5edc1f54156ba451d 100644
--- a/src/TNL/Config/ParameterContainer.h
+++ b/src/TNL/Config/ParameterContainer.h
@@ -10,7 +10,7 @@
 
 #pragma once 
 
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/mpi-supp.h>
 #include <TNL/param-types.h>
@@ -96,7 +96,7 @@ class ParameterContainer
 
    protected:
 
-   List< tnlParameterBase* > parameters;
+   Containers::List< tnlParameterBase* > parameters;
 
 };
 
diff --git a/src/TNL/Containers/ArrayOperations.h b/src/TNL/Containers/Algorithms/ArrayOperations.h
similarity index 96%
rename from src/TNL/Containers/ArrayOperations.h
rename to src/TNL/Containers/Algorithms/ArrayOperations.h
index 042270d5b7455aeb06500a98ddc4b161613b477d..e32c7fd288c098ae2e648c1ad2fdbc7a6ccfc324 100644
--- a/src/TNL/Containers/ArrayOperations.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperations.h
@@ -15,6 +15,7 @@
 
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 template< typename DestinationDevice,
           typename SourceDevice = DestinationDevice >
@@ -64,7 +65,6 @@ class ArrayOperations< Devices::Host >
    static bool compareMemory( const Element1* destination,
                               const Element2* source,
                               const Index size );
-
 };
 
 template<>
@@ -152,8 +152,9 @@ class ArrayOperations< Devices::Host, Devices::Cuda >
                               const Index size );
 };
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
 
-#include <TNL/Containers/ArrayOperationsHost_impl.h>
-#include <TNL/Containers/ArrayOperationsCuda_impl.h>
+#include <TNL/Containers/Algorithms/ArrayOperationsHost_impl.h>
+#include <TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h>
diff --git a/src/TNL/Containers/ArrayOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
similarity index 90%
rename from src/TNL/Containers/ArrayOperationsCuda_impl.h
rename to src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
index 98dd3526891dd929f94da27e7bccc20c564ccfe0..74bf0a73005e86015b443d4c99a6274d39ab2077 100644
--- a/src/TNL/Containers/ArrayOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h
@@ -11,17 +11,22 @@
 #pragma once 
 
 #include <iostream>
+
 #include <TNL/tnlConfig.h>
 #include <TNL/Math.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
 #include <TNL/Containers/Algorithms/reduction-operations.h>
 
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 template< typename Element, typename Index >
-bool ArrayOperations< Devices::Cuda >::allocateMemory( Element*& data,
-                                                    const Index size )
+bool
+ArrayOperations< Devices::Cuda >::
+allocateMemory( Element*& data,
+                const Index size )
 {
 #ifdef HAVE_CUDA
    checkCudaDevice;
@@ -36,9 +41,11 @@ bool ArrayOperations< Devices::Cuda >::allocateMemory( Element*& data,
 }
 
 template< typename Element >
-bool ArrayOperations< Devices::Cuda >::freeMemory( Element* data )
+bool
+ArrayOperations< Devices::Cuda >::
+freeMemory( Element* data )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
 #ifdef HAVE_CUDA
       checkCudaDevice;
       cudaFree( data );
@@ -50,42 +57,51 @@ bool ArrayOperations< Devices::Cuda >::freeMemory( Element* data )
 }
 
 template< typename Element >
-void ArrayOperations< Devices::Cuda >::setMemoryElement( Element* data,
-                                                      const Element& value )
+void
+ArrayOperations< Devices::Cuda >::
+setMemoryElement( Element* data,
+                  const Element& value )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
    ArrayOperations< Devices::Cuda >::setMemory( data, value, 1 );
 }
 
 template< typename Element >
-Element ArrayOperations< Devices::Cuda >::getMemoryElement( const Element* data )
+Element
+ArrayOperations< Devices::Cuda >::
+getMemoryElement( const Element* data )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
    Element result;
    ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< Element, Element, int >( &result, data, 1 );
    return result;
 }
 
 template< typename Element, typename Index >
-Element& ArrayOperations< Devices::Cuda >::getArrayElementReference( Element* data, const Index i )
+Element&
+ArrayOperations< Devices::Cuda >::
+getArrayElementReference( Element* data, const Index i )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
    return data[ i ];
 }
 
 template< typename Element, typename Index >
-const Element& ArrayOperations< Devices::Cuda >::getArrayElementReference( const Element* data, const Index i )
+const
+Element& ArrayOperations< Devices::Cuda >::
+getArrayElementReference( const Element* data, const Index i )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
    return data[ i ];
 }
 
 
 #ifdef HAVE_CUDA
 template< typename Element, typename Index >
-__global__ void setArrayValueCudaKernel( Element* data,
-                                         const Index size,
-                                         const Element value )
+__global__ void
+setArrayValueCudaKernel( Element* data,
+                         const Index size,
+                         const Element value )
 {
    Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
    const Index maxGridSize = blockDim. x * gridDim. x;
@@ -98,11 +114,13 @@ __global__ void setArrayValueCudaKernel( Element* data,
 #endif
 
 template< typename Element, typename Index >
-bool ArrayOperations< Devices::Cuda >::setMemory( Element* data,
-                    const Element& value,
-                    const Index size )
+bool
+ArrayOperations< Devices::Cuda >::
+setMemory( Element* data,
+           const Element& value,
+           const Index size )
 {
-   Assert( data, );
+   TNL_ASSERT( data, );
 #ifdef HAVE_CUDA
    dim3 blockSize( 0 ), gridSize( 0 );
    blockSize. x = 256;
@@ -120,9 +138,10 @@ bool ArrayOperations< Devices::Cuda >::setMemory( Element* data,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-__global__ void copyMemoryCudaToCudaKernel( DestinationElement* destination,
-                                            const SourceElement* source,
-                                            const Index size )
+__global__ void
+copyMemoryCudaToCudaKernel( DestinationElement* destination,
+                            const SourceElement* source,
+                            const Index size )
 {
    Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
    const Index maxGridSize = blockDim. x * gridDim. x;
@@ -137,12 +156,14 @@ __global__ void copyMemoryCudaToCudaKernel( DestinationElement* destination,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool ArrayOperations< Devices::Cuda >::copyMemory( DestinationElement* destination,
-                                                         const SourceElement* source,
-                                                         const Index size )
+bool
+ArrayOperations< Devices::Cuda >::
+copyMemory( DestinationElement* destination,
+            const SourceElement* source,
+            const Index size )
 {
-   Assert( destination, );
-   Assert( source, );
+   TNL_ASSERT( destination, );
+   TNL_ASSERT( source, );
    #ifdef HAVE_CUDA
       if( std::is_same< DestinationElement, SourceElement >::value )
       {
@@ -170,12 +191,14 @@ bool ArrayOperations< Devices::Cuda >::copyMemory( DestinationElement* destinati
 template< typename Element1,
           typename Element2,
           typename Index >
-bool ArrayOperations< Devices::Cuda >::compareMemory( const Element1* destination,
-                                                   const Element2* source,
-                                                   const Index size )
+bool
+ArrayOperations< Devices::Cuda >::
+compareMemory( const Element1* destination,
+               const Element2* source,
+               const Index size )
 {
-   Assert( destination, );
-   Assert( source, );
+   TNL_ASSERT( destination, );
+   TNL_ASSERT( source, );
    //TODO: The parallel reduction on the CUDA device with different element types is needed.
    bool result;
    Algorithms::tnlParallelReductionEqualities< Element1, Index > reductionEqualities;
@@ -190,12 +213,14 @@ bool ArrayOperations< Devices::Cuda >::compareMemory( const Element1* destinatio
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( DestinationElement* destination,
-                                                         const SourceElement* source,
-                                                         const Index size )
+bool
+ArrayOperations< Devices::Host, Devices::Cuda >::
+copyMemory( DestinationElement* destination,
+            const SourceElement* source,
+            const Index size )
 {
-   Assert( destination, );
-   Assert( source, );
+   TNL_ASSERT( destination, );
+   TNL_ASSERT( source, );
    #ifdef HAVE_CUDA
    if( std::is_same< DestinationElement, SourceElement >::value )
    {
@@ -251,16 +276,18 @@ bool ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( DestinationEle
 template< typename Element1,
           typename Element2,
           typename Index >
-bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( const Element1* destination,
-                                                            const Element2* source,
-                                                            const Index size )
+bool
+ArrayOperations< Devices::Host, Devices::Cuda >::
+compareMemory( const Element1* destination,
+               const Element2* source,
+               const Index size )
 {
    /***
     * Here, destination is on host and source is on CUDA device.
     */
-   Assert( destination, );
-   Assert( source, );
-   Assert( size >= 0, std::cerr << "size = " << size );
+   TNL_ASSERT( destination, );
+   TNL_ASSERT( source, );
+   TNL_ASSERT( size >= 0, std::cerr << "size = " << size );
    #ifdef HAVE_CUDA
    Element2* host_buffer = new Element2[ Devices::Cuda::getGPUTransferBufferSize() ];
    if( ! host_buffer )
@@ -303,13 +330,15 @@ bool ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( const Eleme
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory( DestinationElement* destination,
-                                                         const SourceElement* source,
-                                                         const Index size )
+bool
+ArrayOperations< Devices::Cuda, Devices::Host >::
+copyMemory( DestinationElement* destination,
+            const SourceElement* source,
+            const Index size )
 {
-   Assert( destination, );
-   Assert( source, );
-   Assert( size >= 0, std::cerr << "size = " << size );
+   TNL_ASSERT( destination, );
+   TNL_ASSERT( source, );
+   TNL_ASSERT( size >= 0, std::cerr << "size = " << size );
    #ifdef HAVE_CUDA
    if( std::is_same< DestinationElement, SourceElement >::value )
    {
@@ -364,13 +393,15 @@ bool ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory( DestinationEle
 template< typename Element1,
           typename Element2,
           typename Index >
-bool ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory( const Element1* hostData,
-                                                            const Element2* deviceData,
-                                                            const Index size )
+bool
+ArrayOperations< Devices::Cuda, Devices::Host >::
+compareMemory( const Element1* hostData,
+               const Element2* deviceData,
+               const Index size )
 {
-   Assert( hostData, );
-   Assert( deviceData, );
-   Assert( size >= 0, std::cerr << "size = " << size );
+   TNL_ASSERT( hostData, );
+   TNL_ASSERT( deviceData, );
+   TNL_ASSERT( size >= 0, std::cerr << "size = " << size );
    return ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory( deviceData, hostData, size );
 }
 
@@ -651,5 +682,6 @@ extern template bool ArrayOperations< Devices::Cuda >::setMemory< long double, l
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayOperationsHost_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
similarity index 91%
rename from src/TNL/Containers/ArrayOperationsHost_impl.h
rename to src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
index 6bebb34a4e5ff52c5f6cb4ba6804a3073e7ad15f..8d6df347765c636f67bd6122801fa26971670501 100644
--- a/src/TNL/Containers/ArrayOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost_impl.h
@@ -11,15 +11,20 @@
 #pragma once 
 
 #include <type_traits>
-#include <TNL/tnlConfig.h>
 #include <string.h>
 
+#include <TNL/tnlConfig.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
+
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 template< typename Element, typename Index >
-bool ArrayOperations< Devices::Host >::allocateMemory( Element*& data,
-                                                      const Index size )
+bool
+ArrayOperations< Devices::Host >::
+allocateMemory( Element*& data,
+                const Index size )
 {
    if( ! ( data = new Element[ size ] ) )
       return false;
@@ -27,42 +32,54 @@ bool ArrayOperations< Devices::Host >::allocateMemory( Element*& data,
 }
 
 template< typename Element >
-bool ArrayOperations< Devices::Host >::freeMemory( Element* data )
+bool
+ArrayOperations< Devices::Host >::
+freeMemory( Element* data )
 {
    delete[] data;
    return true;
 }
 template< typename Element >
-void ArrayOperations< Devices::Host >::setMemoryElement( Element* data,
-                                                        const Element& value )
+void
+ArrayOperations< Devices::Host >::
+setMemoryElement( Element* data,
+                  const Element& value )
 {
    *data = value;
 };
 
 template< typename Element >
-Element ArrayOperations< Devices::Host >::getMemoryElement( Element* data )
+Element
+ArrayOperations< Devices::Host >::
+getMemoryElement( Element* data )
 {
    return *data;
 };
 
 template< typename Element, typename Index >
-Element& ArrayOperations< Devices::Host >::getArrayElementReference( Element* data,
-                                                                  const Index i )
+Element&
+ArrayOperations< Devices::Host >::
+getArrayElementReference( Element* data,
+                          const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
-const Element& ArrayOperations< Devices::Host >::getArrayElementReference( const Element* data,
-                                                                       const Index i )
+const Element&
+ArrayOperations< Devices::Host >::
+getArrayElementReference( const Element* data,
+                          const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
-bool ArrayOperations< Devices::Host >::setMemory( Element* data,
-                                               const Element& value,
-                                               const Index size )
+bool
+ArrayOperations< Devices::Host >::
+setMemory( Element* data,
+           const Element& value,
+           const Index size )
 {
    for( Index i = 0; i < size; i ++ )
       data[ i ] = value;
@@ -72,9 +89,11 @@ bool ArrayOperations< Devices::Host >::setMemory( Element* data,
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool ArrayOperations< Devices::Host >::copyMemory( DestinationElement* destination,
-                                                const SourceElement* source,
-                                                const Index size )
+bool
+ArrayOperations< Devices::Host >::
+copyMemory( DestinationElement* destination,
+            const SourceElement* source,
+            const Index size )
 {
    if( std::is_same< DestinationElement, SourceElement >::value )
       memcpy( destination, source, size * sizeof( DestinationElement ) );
@@ -87,9 +106,11 @@ bool ArrayOperations< Devices::Host >::copyMemory( DestinationElement* destinati
 template< typename DestinationElement,
           typename SourceElement,
           typename Index >
-bool ArrayOperations< Devices::Host >::compareMemory( const DestinationElement* destination,
-                                                   const SourceElement* source,
-                                                   const Index size )
+bool
+ArrayOperations< Devices::Host >::
+compareMemory( const DestinationElement* destination,
+               const SourceElement* source,
+               const Index size )
 {
    if( std::is_same< DestinationElement, SourceElement >::value )
    {
@@ -284,5 +305,6 @@ extern template bool ArrayOperations< Devices::Host >::setMemory< long double, l
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/CMakeLists.txt b/src/TNL/Containers/Algorithms/CMakeLists.txt
index ea0148af125ed2c1aa9ce973b54f149ab151f892..4b2744aced7e565692ee3a2469580fff1f7186a8 100755
--- a/src/TNL/Containers/Algorithms/CMakeLists.txt
+++ b/src/TNL/Containers/Algorithms/CMakeLists.txt
@@ -1,50 +1,22 @@
-set( headers cuda-prefix-sum.h
+ADD_SUBDIRECTORY( TemplateExplicitInstantiation )
+
+set( headers ArrayOperations.h
+             ArrayOperationsHost_impl.h
+             ArrayOperationsCuda_impl.h
+             cuda-prefix-sum.h
              cuda-prefix-sum_impl.h
-             cuda-reduction.h             
-             cuda-reduction_impl.h
              reduction-operations.h
-             CudaReduction.h
-             CudaReduction_impl.h
-             CudaReductionBuffer.h
              CublasWrapper.h
              CudaMultireductionKernel.h
+             CudaReductionBuffer.h
+             CudaReductionKernel.h
              Multireduction.h
              Multireduction_impl.h
+             Reduction.h
+             Reduction_impl.h
+             VectorOperations.h
+             VectorOperationsHost_impl.h
+             VectorOperationsCuda_impl.h
    )
 
-SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Containers/Algorithms ) 
-IF( BUILD_CUDA )
-   set( tnl_core_cuda_CUDA__SOURCES
-        ${common_SOURCES}
-        ${CURRENT_DIR}/cuda-reduction-sum_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-min_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-max_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-abs-sum_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-abs-min_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-abs-max_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-and_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-or_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-l2-norm_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-lp-norm_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-equalities_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-inequalities_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-scalar-product_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-sum_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-min_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-max_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-abs-sum_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-abs-min_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-abs-max_impl.cu
-        ${CURRENT_DIR}/cuda-reduction-diff-l2-norm_impl.cu        
-        ${CURRENT_DIR}/cuda-reduction-diff-lp-norm_impl.cu        
-        ${CURRENT_DIR}/cuda-prefix-sum_impl.cu
-        PARENT_SCOPE ) 
-endif() 
-
-set( tnl_core_cuda_SOURCES
-     ${common_SOURCES}
-     ${CURRENT_DIR}/cuda-reduction_impl.cpp
-     ${CURRENT_DIR}/cuda-prefix-sum_impl.cpp     
-     PARENT_SCOPE )               
-        
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/TNL/Containers/Algorithms )
diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
index 8a1280f536796eef6d7997578db16f05a689a63b..5ec18f3c13a5227fe081d8f0757dd4e62b8e994a 100644
--- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          CudaMultireductionKernel.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #ifdef HAVE_CUDA
@@ -29,7 +41,7 @@ static constexpr int Multireduction_maxThreadsPerBlock = 256;  // must be a powe
 template< typename Operation, int blockSizeX >      
 __global__ void
 __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
-CudaMultireductionKernel( Operation& operation,
+CudaMultireductionKernel( Operation operation,
                           const typename Operation::IndexType n,
                           const typename Operation::IndexType size,
                           const typename Operation::RealType* input1,
@@ -40,9 +52,7 @@ CudaMultireductionKernel( Operation& operation,
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::ResultType ResultType;
 
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
     * Get thread id (tid) and global element id (gid).
@@ -216,10 +226,10 @@ CudaMultireductionKernelLauncher( Operation& operation,
       throw 1;
    }
 
-   // create reference to the reduction buffer singleton and set default size
+   // create reference to the reduction buffer singleton and set size
    // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
    const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
-   CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance();
+   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
    if( ! cudaReductionBuffer.setSize( buf_size ) )
       throw 1;
    output = cudaReductionBuffer.template getData< ResultType >();
@@ -290,9 +300,9 @@ CudaMultireductionKernelLauncher( Operation& operation,
          <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
          break;
       case   1:
-         Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
+         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
       default:
-         Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl );
+         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl );
    }
    checkCudaDevice;
 
diff --git a/src/TNL/Containers/Algorithms/CudaReduction.h b/src/TNL/Containers/Algorithms/CudaReduction.h
deleted file mode 100644
index 3fab192e9891777fa7472b69497b22c9f52325fe..0000000000000000000000000000000000000000
--- a/src/TNL/Containers/Algorithms/CudaReduction.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/***************************************************************************
-                          CudaReduction.h  -  description
-                             -------------------
-    begin                : Jun 17, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-namespace TNL {
-namespace Containers {
-namespace Algorithms {
-   
-#ifdef HAVE_CUDA
-
-template< typename Operation, int blockSize >
-class CudaReduction
-{
-   public:
-
-      typedef typename Operation::IndexType IndexType;
-      typedef typename Operation::RealType RealType;
-      typedef typename Operation::ResultType ResultType;
-
- 
-      __device__ static void reduce( Operation& operation,
-                                     const IndexType size,
-                                     const RealType* input1,
-                                     const RealType* input2,
-                                     ResultType* output );
-};
- 
-/*template< typename Real, typename Index, int blockSize >
-class CudaReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
-{
-   public:
- 
-      typedef tnlParallelReductionScalarProduct< Real, Index > Operation;
-      typedef typename Operation::IndexType IndexType;
-      typedef typename Operation::RealType RealType;
-      typedef typename Operation::ResultType ResultType;
- 
-      __device__ static void reduce( Operation operation,
-                                     const IndexType size,
-                                     const RealType* input1,
-                                     const RealType* input2,
-                                     ResultType* output );
-};*/
-
-#endif
-
-} // namespace Algorithms
-} // namespace Containers
-} // namespace TNL
-
-#ifdef HAVE_CUDA
-#include <TNL/Containers/Algorithms/CudaReduction_impl.h>
-#endif
-
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2ab7e2cebc348e0797cf11a88775a3e7b5c388d
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -0,0 +1,283 @@
+/***************************************************************************
+                          CudaReductionKernel.h  -  description
+                             -------------------
+    begin                : Jun 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_CUDA
+#include <cuda.h>
+#endif
+
+#include <TNL/Assert.h>
+#include <TNL/Math.h>
+#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
+
+namespace TNL {
+namespace Containers {
+namespace Algorithms {
+
+#ifdef HAVE_CUDA
+/****
+ * The performance of this kernel is very sensitive to register usage.
+ * Compile with --ptxas-options=-v and configure these constants for given
+ * architecture so that there are no local memory spills.
+ */
+static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
+#if (__CUDA_ARCH__ >= 300 )
+   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
+#else
+   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
+#endif
+
+template< typename Operation, int blockSize >
+__global__ void
+__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
+CudaReductionKernel( Operation operation,
+                     const typename Operation::IndexType size,
+                     const typename Operation::RealType* input1,
+                     const typename Operation::RealType* input2,
+                     typename Operation::ResultType* output )
+{
+   typedef typename Operation::IndexType IndexType;
+   typedef typename Operation::ResultType ResultType;
+
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+
+   /***
+    * Get thread id (tid) and global thread id (gid).
+    * gridSize is the number of element processed by all blocks at the
+    * same time.
+    */
+   const IndexType tid = threadIdx.x;
+         IndexType gid = blockIdx.x * blockDim. x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   sdata[ tid ] = operation.initialValue();
+   /***
+    * Read data into the shared memory. We start with the
+    * sequential reduction.
+    */
+   while( gid + 4 * gridSize < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
+      gid += 4 * gridSize;
+   }
+   while( gid + 2 * gridSize < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
+      gid += 2 * gridSize;
+   }
+   while( gid < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      gid += gridSize;
+   }
+   __syncthreads();
+
+
+   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
+
+   //return;
+   /***
+    *  Perform the parallel reduction.
+    */
+   if( blockSize >= 1024 )
+   {
+      if( tid < 512 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 512 )
+   {
+      if( tid < 256 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 256 )
+   {
+      if( tid < 128 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
+      __syncthreads();
+      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+   if( blockSize >= 128 )
+   {
+      if( tid <  64 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
+      __syncthreads();
+      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+
+   /***
+    * This runs in one warp so it is synchronized implicitly.
+    */
+   if( tid < 32 )
+   {
+      volatile ResultType* vsdata = sdata;
+      if( blockSize >= 64 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
+         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 32 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
+         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 16 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
+         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  8 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
+         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  4 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
+         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  2 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
+         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+   }
+
+   /***
+    * Store the result back in the global memory.
+    */
+   if( tid == 0 )
+   {
+      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
+      output[ blockIdx.x ] = sdata[ 0 ];
+   }
+
+}
+
+template< typename Operation >
+typename Operation::IndexType
+CudaReductionKernelLauncher( Operation& operation,
+                             const typename Operation::IndexType size,
+                             const typename Operation::RealType* input1,
+                             const typename Operation::RealType* input2,
+                             typename Operation::ResultType*& output )
+{
+   typedef typename Operation::IndexType IndexType;
+   typedef typename Operation::RealType RealType;
+   typedef typename Operation::ResultType ResultType;
+
+   // The number of blocks should be a multiple of the number of multiprocessors
+   // to ensure optimum balancing of the load. This is very important, because
+   // we run the kernel with a fixed number of blocks, so the amount of work per
+   // block increases with enlarging the problem, so even small imbalance can
+   // cost us dearly.
+   // On Tesla K40c, desGridSize = 4 * 6 * 15 = 360.
+//   const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
+//                                   * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+   // On Tesla K40c, desGridSize = 6 * 15 = 90.
+   const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
+                               * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+   dim3 blockSize( 256 ), gridSize( 0 );
+   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+
+   // create reference to the reduction buffer singleton and set size
+   const size_t buf_size = desGridSize * sizeof( ResultType );
+   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+   if( ! cudaReductionBuffer.setSize( buf_size ) )
+      throw 1;
+   output = cudaReductionBuffer.template getData< ResultType >();
+
+   // when there is only one warp per blockSize.x, we need to allocate two warps
+   // worth of shared memory so that we don't index shared memory out of bounds
+   const IndexType shmem = (blockSize.x <= 32)
+            ? 2 * blockSize.x * sizeof( ResultType )
+            : blockSize.x * sizeof( ResultType );
+
+   /***
+    * Depending on the blockSize we generate appropriate template instance.
+    */
+   switch( blockSize.x )
+   {
+      case 512:
+         CudaReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  64:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  64 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,  64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  32:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  32 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,  32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  16:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  16 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,  16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+     case   8:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   8 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,   8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   4:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   4 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,   4 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   2:
+         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   2 >, cudaFuncCachePreferShared);
+
+         CudaReductionKernel< Operation,   2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   1:
+         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+      default:
+         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+   }
+   checkCudaDevice;
+
+   // return the size of the output array on the CUDA device
+   return gridSize.x;
+}
+#endif
+
+} // namespace Algorithms
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/CudaReduction_impl.h b/src/TNL/Containers/Algorithms/CudaReduction_impl.h
deleted file mode 100644
index ad336a0a54cd2a97c3c1a3b9bd4dc6428c10a7b9..0000000000000000000000000000000000000000
--- a/src/TNL/Containers/Algorithms/CudaReduction_impl.h
+++ /dev/null
@@ -1,294 +0,0 @@
-/***************************************************************************
-                          CudaReduction_impl.h  -  description
-                             -------------------
-    begin                : Jun 17, 2015
-    copyright            : (C) 2015 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-namespace TNL {
-namespace Containers {
-namespace Algorithms {
-
-template< typename Operation, int blockSize >
-__device__
-void
-CudaReduction< Operation, blockSize >::
-reduce( Operation& operation,
-        const IndexType size,
-        const RealType* input1,
-        const RealType* input2,
-        ResultType* output )
-{
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
-
-   /***
-    * Get thread id (tid) and global thread id (gid).
-    * gridSize is the number of element processed by all blocks at the
-    * same time.
-    */
-   IndexType tid = threadIdx. x;
-   IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
-   IndexType gridSize = blockDim. x * gridDim.x;
-
-   sdata[ tid ] = operation.initialValue();
-   /***
-    * Read data into the shared memory. We start with the
-    * sequential reduction.
-    */
-   while( gid + 4 * gridSize < size )
-   {
-      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
-      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
-      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
-      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
-      gid += 4*gridSize;
-   }
-   while( gid + 2 * gridSize < size )
-   {
-      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
-      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
-      gid += 2*gridSize;
-   }
-   while( gid < size )
-   {
-      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
-      gid += gridSize;
-   }
-   __syncthreads();
-
-
-   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
-
-   //return;
-   /***
-    *  Perform the parallel reduction.
-    */
-   if( blockSize >= 1024 )
-   {
-      if( tid < 512 )
-         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 512 )
-   {
-      if( tid < 256 )
-         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 256 )
-   {
-      if( tid < 128 )
-         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
-      __syncthreads();
-      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
-   }
-
-   if( blockSize >= 128 )
-   {
-      if( tid <  64 )
-         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
-      __syncthreads();
-      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
-   }
-
-
-   /***
-    * This runs in one warp so it is synchronized implicitly.
-    */
-   if( tid < 32 )
-   {
-      volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
-         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >= 32 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
-         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >= 16 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
-         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  8 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
-         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  4 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
-         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  2 )
-      {
-         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
-         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-   }
-
-   /***
-    * Store the result back in the global memory.
-    */
-   if( tid == 0 )
-   {
-      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
-      output[ blockIdx.x ] = sdata[ 0 ];
-   }
-
-}
-
-#ifdef UNDEF
-
-template< typename Real, typename Index, int blockSize >
-__device__
-void
-CudaReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
-reduce( Operation& operation,
-        const IndexType size,
-        const RealType* input1,
-        const RealType* input2,
-        ResultType* output )
-{
-  extern __shared__ __align__ ( 8 ) char __sdata[];
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
-
-   /***
-    * Get thread id (tid) and global thread id (gid).
-    * gridSize is the number of element processed by all blocks at the
-    * same time.
-    */
-   IndexType tid = threadIdx. x;
-   IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
-   IndexType gridSize = blockDim. x * gridDim.x;
-
-   /***
-    * Read data into the shared memory. We start with the
-    * sequential reduction.
-    */
-   sdata[ tid ] = ( RealType ) 0;
-   /*while( gid + 4 * gridSize < size )
-   {
-      sdata[ tid ] += input1[ gid                ] * input2[ gid ];
-      sdata[ tid ] += input1[ gid + gridSize     ] * input2[ gid + gridSize ];
-      sdata[ tid ] += input1[ gid + 2 * gridSize ] * input2[ gid + 2 * gridSize ];
-      sdata[ tid ] += input1[ gid + 3 * gridSize ] * input2[ gid + 3 * gridSize ];
-      gid += 4*gridSize;
-   }
-   while( gid + 2 * gridSize < size )
-   {
-      sdata[ tid ] += input1[ gid            ] * input2[ gid ];
-      sdata[ tid ] += input1[ gid + gridSize ] * input2[ gid + gridSize ];
-      gid += 2*gridSize;
-   }*/
-   while( gid < size )
-   {
-      sdata[ tid ] += input1[ gid ] * input2[ gid ];
-      gid += gridSize;
-   }
-   __syncthreads();
-
-   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
-
-   /***
-    *  Perform the parallel reduction.
-    */
-   if( blockSize >= 1024 )
-   {
-      if( tid < 512 )
-         sdata[ tid ] += sdata[ tid + 512 ];
-      __syncthreads();
-   }
-   if( blockSize >= 512 )
-   {
-      if( tid < 256 )
-         sdata[ tid ] += sdata[ tid + 256 ];
-      __syncthreads();
-   }
-   if( blockSize >= 256 )
-   {
-      if( tid < 128 )
-         sdata[ tid ] += sdata[ tid + 128 ];
-      __syncthreads();
-      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
-   }
-
-   if( blockSize >= 128 )
-   {
-      if( tid <  64 )
-         sdata[ tid ] += sdata[ tid + 64 ];
-      __syncthreads();
-      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
-   }
-
-   /***
-    * This runs in one warp so it is synchronized implicitly.
-    */
-   if( tid < 32 )
-   {
-      volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 32 ];
-         //__syncthreads();
-         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >= 32 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 16 ];
-         //__syncthreads();
-         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >= 16 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 8 ];
-         //__syncthreads();
-         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  8 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 4 ];
-         //__syncthreads();
-         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  4 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 2 ];
-         //__syncthreads();
-         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-      if( blockSize >=  2 )
-      {
-         vsdata[ tid ] += vsdata[ tid + 1 ];
-         //__syncthreads();
-         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
-      }
-   }
-
-   /***
-    * Store the result back in the global memory.
-    */
-   if( tid == 0 )
-   {
-      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
-      output[ blockIdx.x ] = sdata[ 0 ];
-   }
-}
-
-#endif
-
-} // namespace Algorithms
-} // namespace Containers
-} // namespace TNL
-
diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h
index 4eb3cf7fcdba79cfe989a7434227c9fe5104f1d9..9087db93ba6bfded63438c2e59175cc80794042c 100644
--- a/src/TNL/Containers/Algorithms/Multireduction.h
+++ b/src/TNL/Containers/Algorithms/Multireduction.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          Multireduction.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <TNL/Devices/Host.h>
diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h
index 1b9076c38d5f31ccc77f65c18e1a241ad741d75a..01db7844985447ac883020dde9478987c4d3b2a9 100644
--- a/src/TNL/Containers/Algorithms/Multireduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          Multireduction_impl.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include "Multireduction.h"
@@ -6,7 +18,7 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Containers/Algorithms/CudaMultireductionKernel.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
@@ -47,8 +59,8 @@ reduce( Operation& operation,
         typename Operation::ResultType* hostResult )
 {
 #ifdef HAVE_CUDA
-   Assert( n > 0, );
-   Assert( size <= ldInput1, );
+   TNL_ASSERT( n > 0, );
+   TNL_ASSERT( size <= ldInput1, );
 
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::RealType RealType;
@@ -65,7 +77,7 @@ reduce( Operation& operation,
          return false;
       if( deviceInput2 ) {
          RealType hostArray2[ Multireduction_minGpuDataSize ];
-         if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) )
+         if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
             return false;
          return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
       }
@@ -93,7 +105,7 @@ reduce( Operation& operation,
                                                                    deviceAux1 );
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
-      cout << "   Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl;
+      std::cout << "   Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
       timer.reset();
       timer.start();
    #endif
@@ -107,18 +119,18 @@ reduce( Operation& operation,
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
-      cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl;
+      std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
       timer.reset();
       timer.start();
    #endif
 
-//   cout << "resultArray = [";
+//   std::cout << "resultArray = [";
 //   for( int i = 0; i < n * reducedSize; i++ ) {
-//      cout << resultArray[ i ];
+//      std::cout << resultArray[ i ];
 //      if( i < n * reducedSize - 1 )
-//         cout << ", ";
+//         std::cout << ", ";
 //   }
-//   cout << "]" << endl;
+//   std::cout << "]" << std::endl;
 
    /***
     * Reduce the data on the host system.
@@ -128,7 +140,7 @@ reduce( Operation& operation,
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
-      cout << "   Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl;
+      std::cout << "   Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
    #endif
 
    return checkCudaDevice;
@@ -159,8 +171,8 @@ reduce( Operation& operation,
         const typename Operation::RealType* input2,
         typename Operation::ResultType* result )
 {
-   Assert( n > 0, );
-   Assert( size <= ldInput1, );
+   TNL_ASSERT( n > 0, );
+   TNL_ASSERT( size <= ldInput1, );
 
    typedef typename Operation::IndexType IndexType;
    typedef typename Operation::RealType RealType;
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
similarity index 68%
rename from src/TNL/Containers/Algorithms/cuda-reduction.h
rename to src/TNL/Containers/Algorithms/Reduction.h
index 093c233c165c50e0c73401edff739374279d2158..242d1b7b83fbf8d71aa545630975561aed65c736 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          cuda-reduction.h  -  description
+                          Reduction.h  -  description
                              -------------------
     begin                : Oct 28, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
+    copyright            : (C) 2010 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -14,6 +14,14 @@ namespace TNL {
 namespace Containers {
 namespace Algorithms {   
 
+// TODO: rename to
+//   template< typename Device >
+//   class Reduction
+//   {};
+//
+// and make a specialization for Devices::Host (as it is done in Multireduction.h)
+// It should be as fast as all the manual implementations in VectorOperations.
+
 template< typename Operation >
 bool reductionOnCudaDevice( const Operation& operation,
                             const typename Operation :: IndexType size,
@@ -25,5 +33,4 @@ bool reductionOnCudaDevice( const Operation& operation,
 } // namespace Containers
 } // namespace TNL
 
-#include <TNL/Containers/Algorithms/cuda-reduction_impl.h>
-
+#include <TNL/Containers/Algorithms/Reduction_impl.h>
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h
similarity index 96%
rename from src/TNL/Containers/Algorithms/cuda-reduction_impl.h
rename to src/TNL/Containers/Algorithms/Reduction_impl.h
index dce163b67ebb115d9b58a5186899dc6ba2990d9a..cd4d636c824481c56c56e30fb6f285045df0fe7c 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Reduction_impl.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          cuda-reduction_impl.h  -  description
+                          Reduction_impl.h  -  description
                              -------------------
     begin                : Mar 24, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -10,17 +10,14 @@
 
 #pragma once 
 
+#include "Reduction.h"
+
 //#define CUDA_REDUCTION_PROFILING
 
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
 #include <TNL/Assert.h>
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/ArrayOperations.h>
-#include <TNL/Math.h>
-#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
-#include <TNL/Containers/Algorithms/CudaReduction.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/CudaReductionKernel.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
 #include <iostream>
@@ -39,95 +36,6 @@ namespace Algorithms {
  */
 const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
 
-#ifdef HAVE_CUDA
-
-template< typename Operation, int blockSize >
-__global__ void
-CudaReductionKernel( Operation operation,
-                     const typename Operation::IndexType size,
-                     const typename Operation::RealType* input1,
-                     const typename Operation::RealType* input2,
-                     typename Operation::ResultType* output )
-{
-   typedef CudaReduction< Operation, blockSize > Reduction;
-   Reduction::reduce( operation, size, input1, input2, output );
-};
-
-template< typename Operation >
-typename Operation::IndexType
-reduceOnCudaDevice( Operation& operation,
-                    const typename Operation::IndexType size,
-                    const typename Operation::RealType* input1,
-                    const typename Operation::RealType* input2,
-                    typename Operation::ResultType*& output)
-{
-   typedef typename Operation::IndexType IndexType;
-   typedef typename Operation::RealType RealType;
-   typedef typename Operation::ResultType ResultType;
- 
-   const IndexType desGridSize( minGPUReductionDataSize );
-   dim3 blockSize( 256 ), gridSize( 0 );
-   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
- 
-   // create reference to the reduction buffer singleton and set default size
-   CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize );
- 
-   if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
-      return false;
-   output = cudaReductionBuffer.template getData< ResultType >();
-   IndexType shmem = blockSize.x * sizeof( ResultType );
- 
-   /***
-    * Depending on the blockSize we generate appropriate template instance.
-    */
-   switch( blockSize.x )
-   {
-      case 512:
-         CudaReductionKernel< Operation, 512 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case 256:
-         CudaReductionKernel< Operation, 256 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case 128:
-         CudaReductionKernel< Operation, 128 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  64:
-         CudaReductionKernel< Operation,  64 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  32:
-         CudaReductionKernel< Operation,  32 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  16:
-         CudaReductionKernel< Operation,  16 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-     case   8:
-         CudaReductionKernel< Operation,   8 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case   4:
-         CudaReductionKernel< Operation,   4 >
-        <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-        break;
-      case   2:
-         CudaReductionKernel< Operation,   2 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case   1:
-         Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
-      default:
-         Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-   }
-   //checkCudaDevice;
-   return gridSize. x;
-}
-#endif
-
 template< typename Operation >
 bool
 reductionOnCudaDevice( Operation& operation,
@@ -151,10 +59,10 @@ reductionOnCudaDevice( Operation& operation,
    {
       RealType hostArray1[ minGPUReductionDataSize ];
       RealType hostArray2[ minGPUReductionDataSize ];
-      if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, size ) )
+      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, size ) )
          return false;
       if( deviceInput2 && !
-          Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
+          ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
          return false;
       result = operation.initialValue();
       for( IndexType i = 0; i < size; i ++ )
@@ -172,11 +80,11 @@ reductionOnCudaDevice( Operation& operation,
     * Reduce the data on the CUDA device.
     */
    ResultType* deviceAux1( 0 );
-   IndexType reducedSize = reduceOnCudaDevice( operation,
-                                               size,
-                                               deviceInput1,
-                                               deviceInput2,
-                                               deviceAux1 );
+   IndexType reducedSize = CudaReductionKernelLauncher( operation,
+                                                        size,
+                                                        deviceInput1,
+                                                        deviceInput2,
+                                                        deviceAux1 );
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
       std::cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
@@ -187,8 +95,8 @@ reductionOnCudaDevice( Operation& operation,
    /***
     * Transfer the reduced data from device to host.
     */
-   ResultType resultArray[ minGPUReductionDataSize ];
-   if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
+   ResultType resultArray[ reducedSize ];
+   if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
       return false;
  
    #ifdef CUDA_REDUCTION_PROFILING
diff --git a/src/TNL/Containers/ArrayOperationsCuda_impl.cpp b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cpp
similarity index 99%
rename from src/TNL/Containers/ArrayOperationsCuda_impl.cpp
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cpp
index 019b4f9ea48237e58237d9b964ce8403f50fee03..33a2aa1e10b4bf43383a3964f35c4bdc60466f32 100644
--- a/src/TNL/Containers/ArrayOperationsCuda_impl.cpp
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cpp
@@ -8,10 +8,11 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 
 namespace TNL {
 namespace Containers {    
+namespace Algorithms {
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
@@ -290,8 +291,6 @@ template bool ArrayOperations< Devices::Cuda >::setMemory< long double, long int
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
-
-
-
diff --git a/src/TNL/Containers/ArrayOperationsCuda_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cu
similarity index 99%
rename from src/TNL/Containers/ArrayOperationsCuda_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cu
index 24e2de2063ed17cadba0834cb76de32cadfa7adb..8690a9256c613c07110c6c25df0186683c9254d0 100644
--- a/src/TNL/Containers/ArrayOperationsCuda_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsCuda_impl.cu
@@ -8,10 +8,11 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 
 namespace TNL {
 namespace Containers {
+namespace Algorithms {
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
@@ -290,5 +291,6 @@ template bool ArrayOperations< Devices::Cuda >::setMemory< long double, long int
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayOperationsHost_impl.cpp b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cpp
similarity index 99%
rename from src/TNL/Containers/ArrayOperationsHost_impl.cpp
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cpp
index a684b3608e20d627a92939822b4d9e0db3228cb8..28b2d6d494f968d00f2856c285ec73b8fba45c26 100644
--- a/src/TNL/Containers/ArrayOperationsHost_impl.cpp
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cpp
@@ -8,10 +8,11 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 
 namespace TNL {
 namespace Containers {    
+namespace Algorithms {
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
@@ -194,5 +195,6 @@ template bool ArrayOperations< Devices::Host >::setMemory< long double, long int
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
-} // namespace TNL
\ No newline at end of file
+} // namespace TNL
diff --git a/src/TNL/Containers/ArrayOperationsHost_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cu
similarity index 99%
rename from src/TNL/Containers/ArrayOperationsHost_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cu
index 2be45e40951538866bbc6998c9a451988eba916a..5c4f0b8789145a3328088d1f596d65b0ce2caee1 100644
--- a/src/TNL/Containers/ArrayOperationsHost_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/ArrayOperationsHost_impl.cu
@@ -8,10 +8,11 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 
 namespace TNL {
 namespace Containers {
+namespace Algorithms {
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
@@ -194,5 +195,6 @@ template bool ArrayOperations< Devices::Host >::setMemory< long double, long int
 
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/CMakeLists.txt b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/CMakeLists.txt
new file mode 100755
index 0000000000000000000000000000000000000000..8770de6f389c2aa8ed7418c620067f8da053f681
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/CMakeLists.txt
@@ -0,0 +1,45 @@
+SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation )
+set( common_SOURCES
+     ${CURRENT_DIR}/VectorOperationsHost_impl.cpp
+)
+IF( BUILD_CUDA )
+   set( tnl_core_cuda_CUDA__SOURCES
+        ${common_SOURCES}
+        ${CURRENT_DIR}/ArrayOperationsHost_impl.cu
+        ${CURRENT_DIR}/ArrayOperationsCuda_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-sum_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-min_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-max_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-abs-sum_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-abs-min_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-abs-max_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-and_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-or_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-l2-norm_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-lp-norm_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-equalities_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-inequalities_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-scalar-product_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-sum_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-min_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-max_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-abs-sum_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-abs-min_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-abs-max_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-l2-norm_impl.cu
+        ${CURRENT_DIR}/cuda-reduction-diff-lp-norm_impl.cu
+        ${CURRENT_DIR}/cuda-prefix-sum_impl.cu
+        ${CURRENT_DIR}/VectorOperationsCuda_impl.cu
+        PARENT_SCOPE )
+ELSE()
+   set( common_SOURCES
+        ${common_SOURCES}
+        ${CURRENT_DIR}/ArrayOperationsHost_impl.cpp
+        ${CURRENT_DIR}/ArrayOperationsCuda_impl.cpp
+   )
+ENDIF()
+
+set( tnl_core_cuda_SOURCES
+     ${common_SOURCES}
+     ${CURRENT_DIR}/cuda-reduction_impl.cpp
+     PARENT_SCOPE )
diff --git a/src/TNL/Containers/VectorOperationsCuda_impl.cpp b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsCuda_impl.cpp
similarity index 100%
rename from src/TNL/Containers/VectorOperationsCuda_impl.cpp
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsCuda_impl.cpp
diff --git a/src/TNL/Containers/VectorOperationsCuda_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsCuda_impl.cu
similarity index 100%
rename from src/TNL/Containers/VectorOperationsCuda_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsCuda_impl.cu
diff --git a/src/TNL/Containers/VectorOperationsHost_impl.cpp b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsHost_impl.cpp
similarity index 100%
rename from src/TNL/Containers/VectorOperationsHost_impl.cpp
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/VectorOperationsHost_impl.cpp
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-prefix-sum_impl.cu
similarity index 100%
rename from src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-prefix-sum_impl.cu
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-abs-max_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-max_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-abs-max_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-max_impl.cu
index 631c94c3a6f7e744e68c571a4709d7fa552fcb79..f4569b196c1b0f56af4d10690d88c4ab320bb780 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-abs-max_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-max_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-abs-min_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-min_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-abs-min_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-min_impl.cu
index b85f6375cdfd82370aff64f0e08c62f52908078a..6206cba87118ad2b347c516ca5896f1eb7a0dcb4 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-abs-min_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-min_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-abs-sum_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-sum_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-abs-sum_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-sum_impl.cu
index 134986e088410f8672544bff97f771870c7b26ba..15819cb4b2e111a6304e1e9c3c2a64d6a914c369 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-abs-sum_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-abs-sum_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-and_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-and_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-and_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-and_impl.cu
index 5043f7047e72807b4ab5fd522b2c7e6d15befbba..edb30509c62de2803b8bba24d24f3b973aed4a33 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-and_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-and_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-max_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-max_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-max_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-max_impl.cu
index 86e44fde250f48b98df924ed02b611588f5c82bf..d402b1b490660b58df4c76227867944190779559 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-max_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-max_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-min_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-min_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-min_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-min_impl.cu
index e1776f18ac659d8e0d8110dee50e498b5320010c..f954631a6677013319d9e250fe3a6892cf06abcc 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-min_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-min_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-sum_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-sum_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-sum_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-sum_impl.cu
index 68a8852ab5e64abf82d6eae118fb2b65c6efbfb4..3e87fd7c8ec204376bea0db88ffa85282d792390 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-abs-sum_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-abs-sum_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-l2-norm_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-l2-norm_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-l2-norm_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-l2-norm_impl.cu
index 5ed99ebb73bbe2b5f15fa527f5e0239d90f867e0..c0f23b3102e45b51c754b771573129926efde8e8 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-l2-norm_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-l2-norm_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-lp-norm_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-lp-norm_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-lp-norm_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-lp-norm_impl.cu
index 8ef608d4fff8f881aa92917ee6d2cfca794e75bd..a0d4a00262633dafc8b023e927647fd18fb760dd 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-lp-norm_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-lp-norm_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-max_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-max_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-max_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-max_impl.cu
index 93a5618d17778ea97da9b2be862aef263462b67e..3eaf7558b545ee30d4bbcf7de1394e2c7e357bb7 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-max_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-max_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-min_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-min_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-min_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-min_impl.cu
index aed84b9e0cca33ca2be05149750e629602bdd885..9e0a1b447f1e54889f72aca3008c41a626b911ca 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-min_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-min_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-diff-sum_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-sum_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-diff-sum_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-sum_impl.cu
index a7a2276d278305588080ab2004cbcc37385945f8..cbf0958556eb1d6c0b50654a81ecdc17f1c47650 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-diff-sum_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-diff-sum_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-equalities_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-equalities_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-equalities_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-equalities_impl.cu
index 233f3579c062b12363f47e94fb88bd8118822e3b..7b7c322b7e51e54e9ae3c4826391dff661ec3456 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-equalities_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-equalities_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-inequalities_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-inequalities_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-inequalities_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-inequalities_impl.cu
index 9a0e49d0c54b0acf277e87a4d4a1c5e4a9f42dba..08ca8d8bdc421c345d17671d2ea27829de080b82 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-inequalities_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-inequalities_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-l2-norm_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-l2-norm_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-l2-norm_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-l2-norm_impl.cu
index 9a6f3d4da1e56f297855379f0b6153e2d8fe07e2..5169e1a2adc8eed4422221887f663dc15f5d612d 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-l2-norm_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-l2-norm_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-lp-norm_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-lp-norm_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-lp-norm_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-lp-norm_impl.cu
index 003668f9d40980406cb02d349f1a31a50eb8fc35..3d5366013a114a5e5dfe2ec317bfc0015485019c 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-lp-norm_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-lp-norm_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-max_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-max_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-max_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-max_impl.cu
index bfc541589615adc175af670426eb55063e7bbf1c..a2965136d9816f4ad4ba3b5eaf1d29a9c49b7d82 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-max_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-max_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-min_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-min_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-min_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-min_impl.cu
index 9c047fd8f5c2fc3d9f9dad1cb618b0f89db2a27c..2434189c4374574eff2a0a51ad5d8f50da2b833b 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-min_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-min_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-or_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-or_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-or_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-or_impl.cu
index e90d60b206b0d607dd836695361df8217d9c7a31..6e2c9849ec896138b4c8cf106a8922cd2437c8a9 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-or_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-or_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-scalar-product_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-scalar-product_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-scalar-product_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-scalar-product_impl.cu
index 9fcc082fff3caeab8ce223c9b52dcd17b51dad3b..eabb3aff6c912faa88b45727c0082dac0538afea 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-scalar-product_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-scalar-product_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction-sum_impl.cu b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-sum_impl.cu
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction-sum_impl.cu
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-sum_impl.cu
index 252f6f8fe3bc1c04cbb1e28c0c9a13a708d3274c..79d9263ab4922d91caf08493653a8b1effdc1cf6 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction-sum_impl.cu
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction-sum_impl.cu
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
  
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
  
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Algorithms/cuda-reduction_impl.cpp b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction_impl.cpp
similarity index 99%
rename from src/TNL/Containers/Algorithms/cuda-reduction_impl.cpp
rename to src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction_impl.cpp
index b3dbbf719ae323edabc08f8cd6b5d00a8a71d51b..ce76fd397eea1d50f6fb873038f0a1223efd72e4 100644
--- a/src/TNL/Containers/Algorithms/cuda-reduction_impl.cpp
+++ b/src/TNL/Containers/Algorithms/TemplateExplicitInstantiation/cuda-reduction_impl.cpp
@@ -9,7 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Containers/Algorithms/reduction-operations.h>
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h
similarity index 87%
rename from src/TNL/Containers/VectorOperations.h
rename to src/TNL/Containers/Algorithms/VectorOperations.h
index bb8de59506ca28bce639486280fdfb09e1ccbf00..fa490959f475fbe8a801fa33ef18767eb13d498d 100644
--- a/src/TNL/Containers/VectorOperations.h
+++ b/src/TNL/Containers/Algorithms/VectorOperations.h
@@ -10,13 +10,14 @@
 
 #pragma once 
 
-#include <TNL/Containers/Algorithms/cuda-reduction.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
 #include <TNL/Containers/Algorithms/reduction-operations.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 template< typename Device >
 class VectorOperations{};
@@ -64,36 +65,36 @@ class VectorOperations< Devices::Host >
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceMax( const Vector1& v1,
-                                                               const Vector2& v2 );
+                                                             const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceMin( const Vector1& v1,
-                                                               const Vector2& v2 );
+                                                             const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceAbsMax( const Vector1& v1,
-                                                                  const Vector2& v2 );
+                                                                const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceAbsMin( const Vector1& v1,
-                                                                  const Vector2& v2 );
+                                                                const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceL1Norm( const Vector1& v1,
-                                                           const Vector2& v2 );
+                                                                const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceL2Norm( const Vector1& v1,
-                                                           const Vector2& v2 );
+                                                                const Vector2& v2 );
  
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceLpNorm( const Vector1& v1,
-                                                           const Vector2& v2,
-                                                           const typename Vector1::RealType& p );
+                                                                const Vector2& v2,
+                                                                const typename Vector1::RealType& p );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceSum( const Vector1& v1,
-                                                               const Vector2& v2 );
+                                                             const Vector2& v2 );
  
  
    template< typename Vector >
@@ -102,7 +103,7 @@ class VectorOperations< Devices::Host >
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getScalarProduct( const Vector1& v1,
-                                                         const Vector2& v2 );
+                                                       const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static void addVector( Vector1& y,
@@ -127,7 +128,6 @@ class VectorOperations< Devices::Host >
    static void computeExclusivePrefixSum( Vector& v,
                                           const typename Vector::IndexType begin,
                                           const typename Vector::IndexType end );
-
 };
 
 template<>
@@ -177,11 +177,11 @@ class VectorOperations< Devices::Cuda >
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceMin( const Vector1& v1,
-                                                               const Vector2& v2 );
+                                                             const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceAbsMax( const Vector1& v1,
-                                                                  const Vector2& v2 );
+                                                                const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceAbsMin( const Vector1& v1,
@@ -197,12 +197,12 @@ class VectorOperations< Devices::Cuda >
  
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceLpNorm( const Vector1& v1,
-                                                           const Vector2& v2,
-                                                           const typename Vector1::RealType& p );
+                                                                const Vector2& v2,
+                                                                const typename Vector1::RealType& p );
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getVectorDifferenceSum( const Vector1& v1,
-                                                               const Vector2& v2 );
+                                                             const Vector2& v2 );
  
    template< typename Vector >
    static void vectorScalarMultiplication( Vector& v,
@@ -210,7 +210,7 @@ class VectorOperations< Devices::Cuda >
 
    template< typename Vector1, typename Vector2 >
    static typename Vector1::RealType getScalarProduct( const Vector1& v1,
-                                                         const Vector2& v2 );
+                                                       const Vector2& v2 );
 
    template< typename Vector1, typename Vector2 >
    static void addVector( Vector1& y,
@@ -238,9 +238,9 @@ class VectorOperations< Devices::Cuda >
                                           const typename Vector::IndexType end );
 };
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
 
-#include <TNL/Containers/VectorOperationsHost_impl.h>
-#include <TNL/Containers/VectorOperationsCuda_impl.h>
-
+#include <TNL/Containers/Algorithms/VectorOperationsHost_impl.h>
+#include <TNL/Containers/Algorithms/VectorOperationsCuda_impl.h>
diff --git a/src/TNL/Containers/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
similarity index 72%
rename from src/TNL/Containers/VectorOperationsCuda_impl.h
rename to src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
index 3c6774613ef6448be062dcc9c899fcffeb9ad71e..5d4b29379d62a609752d1d2cdedbdaf0b1ba662b 100644
--- a/src/TNL/Containers/VectorOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
@@ -11,96 +11,110 @@
 #pragma once
 
 #include <TNL/tnlConfig.h>
+#include <TNL/Containers/Algorithms/VectorOperations.h>
 #include <TNL/Containers/Algorithms/cuda-prefix-sum.h>
 #include <TNL/Containers/Algorithms/CublasWrapper.h>
 
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 template< typename Vector >
-void VectorOperations< Devices::Cuda >::addElement( Vector& v,
-                                                 const typename Vector::IndexType i,
-                                                 const typename Vector::RealType& value )
+void
+VectorOperations< Devices::Cuda >::
+addElement( Vector& v,
+            const typename Vector::IndexType i,
+            const typename Vector::RealType& value )
 {
    v[ i ] += value;
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Cuda >::addElement( Vector& v,
-                                                 const typename Vector::IndexType i,
-                                                 const typename Vector::RealType& value,
-                                                 const typename Vector::RealType& thisElementMultiplicator )
+void
+VectorOperations< Devices::Cuda >::
+addElement( Vector& v,
+            const typename Vector::IndexType i,
+            const typename Vector::RealType& value,
+            const typename Vector::RealType& thisElementMultiplicator )
 {
    v[ i ] = thisElementMultiplicator * v[ i ] + value;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Cuda > :: getVectorMax( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Cuda >::
+getVectorMax( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionMax< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Cuda > :: getVectorMin( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Cuda >::
+getVectorMin( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionMin< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Cuda > :: getVectorAbsMax( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Cuda >::
+getVectorAbsMax( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionAbsMax< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Cuda > :: getVectorAbsMin( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Cuda >::
+getVectorAbsMin( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionAbsMin< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
@@ -111,16 +125,16 @@ typename Vector::RealType
 VectorOperations< Devices::Cuda >::
 getVectorL1Norm( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionAbsSum< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
@@ -131,16 +145,16 @@ typename Vector::RealType
 VectorOperations< Devices::Cuda >::
 getVectorL2Norm( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionL2Norm< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return std::sqrt( result );
@@ -153,11 +167,11 @@ VectorOperations< Devices::Cuda >::
 getVectorLpNorm( const Vector& v,
                  const typename Vector::RealType& p )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
-   Assert( p > 0.0,
+   TNL_ASSERT( v.getSize() > 0, );
+   TNL_ASSERT( p > 0.0,
               std::cerr << " p = " << p );
  
    if( p == 1 )
@@ -166,110 +180,120 @@ getVectorLpNorm( const Vector& v,
       return getVectorL2Norm( v );
    Real result( 0 );
    Algorithms::tnlParallelReductionLpNorm< Real, Index > operation;
-   operation. setPower( p );
+   operation.setPower( p );
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return std::pow( result, 1.0 / p );
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Cuda > :: getVectorSum( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Cuda >::
+getVectorSum( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionSum< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v. getSize(),
-                          v. getData(),
+                          v.getSize(),
+                          v.getData(),
                           ( Real* ) 0,
                           result );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getVectorDifferenceMax( const Vector1& v1,
-                                                            const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getVectorDifferenceMax( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffMax< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getVectorDifferenceMin( const Vector1& v1,
-                                                            const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getVectorDifferenceMin( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffMin< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
 
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getVectorDifferenceAbsMax( const Vector1& v1,
-                                                               const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getVectorDifferenceAbsMax( const Vector1& v1,
+                           const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffAbsMax< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getVectorDifferenceAbsMin( const Vector1& v1,
-                                                            const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getVectorDifferenceAbsMin( const Vector1& v1,
+                           const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffAbsMin< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
@@ -280,18 +304,18 @@ VectorOperations< Devices::Cuda >::
 getVectorDifferenceL1Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffAbsSum< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
@@ -302,18 +326,18 @@ VectorOperations< Devices::Cuda >::
 getVectorDifferenceL2Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffL2Norm< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return ::sqrt( result );
 }
@@ -324,55 +348,58 @@ typename Vector1::RealType
 VectorOperations< Devices::Cuda >::
 getVectorDifferenceLpNorm( const Vector1& v1,
                            const Vector2& v2,
-                           const typename Vector1 :: RealType& p )
+                           const typename Vector1::RealType& p )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( p > 0.0,
+   TNL_ASSERT( p > 0.0,
               std::cerr << " p = " << p );
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffLpNorm< Real, Index > operation;
    operation.setPower( p );
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return ::pow( result, 1.0 / p );
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getVectorDifferenceSum( const Vector1& v1,
-                                                         const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getVectorDifferenceSum( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
    Algorithms::tnlParallelReductionDiffSum< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
 
 #ifdef HAVE_CUDA
 template< typename Real, typename Index >
-__global__ void vectorScalarMultiplicationCudaKernel( Real* data,
-                                                      Index size,
-                                                      Real alpha )
+__global__ void
+vectorScalarMultiplicationCudaKernel( Real* data,
+                                      Index size,
+                                      Real alpha )
 {
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
+   Index elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+   const Index maxGridSize = blockDim.x * gridDim.x;
    while( elementIdx < size )
    {
       data[ elementIdx ] *= alpha;
@@ -382,20 +409,22 @@ __global__ void vectorScalarMultiplicationCudaKernel( Real* data,
 #endif
 
 template< typename Vector >
-void VectorOperations< Devices::Cuda > :: vectorScalarMultiplication( Vector& v,
-                                                                   const typename Vector::RealType& alpha )
+void
+VectorOperations< Devices::Cuda >::
+vectorScalarMultiplication( Vector& v,
+                            const typename Vector::RealType& alpha )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
    #ifdef HAVE_CUDA
       dim3 blockSize( 0 ), gridSize( 0 );
       const Index& size = v.getSize();
-      blockSize. x = 256;
-      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
-      gridSize. x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
+      blockSize.x = 256;
+      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize.x );
+      gridSize.x = min( blocksNumber, Devices::Cuda::getMaxGridSize() );
       vectorScalarMultiplicationCudaKernel<<< gridSize, blockSize >>>( v.getData(),
                                                                        size,
                                                                        alpha );
@@ -407,14 +436,16 @@ void VectorOperations< Devices::Cuda > :: vectorScalarMultiplication( Vector& v,
 
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getScalarProduct( const Vector1& v1,
-                                                                                 const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Cuda >::
+getScalarProduct( const Vector1& v1,
+                  const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0 );
 /*#if defined HAVE_CUBLAS && defined HAVE_CUDA
@@ -425,9 +456,9 @@ typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getScalarProdu
 #endif*/
    Algorithms::tnlParallelReductionScalarProduct< Real, Index > operation;
    reductionOnCudaDevice( operation,
-                          v1. getSize(),
-                          v1. getData(),
-                          v2. getData(),
+                          v1.getSize(),
+                          v1.getData(),
+                          v2.getData(),
                           result );
    return result;
 }
@@ -435,14 +466,15 @@ typename Vector1 :: RealType VectorOperations< Devices::Cuda > :: getScalarProdu
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index >
-__global__ void vectorAddVectorCudaKernel( Real* y,
-                                           const Real* x,
-                                           const Index size,
-                                           const Real alpha,
-                                           const Real thisMultiplicator )
+__global__ void
+vectorAddVectorCudaKernel( Real* y,
+                           const Real* x,
+                           const Index size,
+                           const Real alpha,
+                           const Real thisMultiplicator )
 {
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
+   Index elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+   const Index maxGridSize = blockDim.x * gridDim.x;
    if( thisMultiplicator == 1.0 )
       while( elementIdx < size )
       {
@@ -455,24 +487,24 @@ __global__ void vectorAddVectorCudaKernel( Real* y,
          y[ elementIdx ] = thisMultiplicator * y[ elementIdx ] + alpha * x[ elementIdx ];
          elementIdx += maxGridSize;
       }
-
 }
 #endif
 
 template< typename Vector1, typename Vector2 >
-void VectorOperations< Devices::Cuda > :: addVector( Vector1& y,
-                                                  const Vector2& x,
-                                                  const typename Vector2::RealType& alpha,
-                                                  const typename Vector1::RealType& thisMultiplicator )
+void
+VectorOperations< Devices::Cuda >::
+addVector( Vector1& y,
+           const Vector2& x,
+           const typename Vector2::RealType& alpha,
+           const typename Vector1::RealType& thisMultiplicator )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
-
-   Assert( y. getSize() > 0, );
-   Assert( y. getSize() == x. getSize(), );
-   Assert( y.getData() != 0, );
-   Assert( x.getData() != 0, );
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
+   TNL_ASSERT( y.getSize() > 0, );
+   TNL_ASSERT( y.getSize() == x.getSize(), );
+   TNL_ASSERT( y.getData() != 0, );
+   TNL_ASSERT( x.getData() != 0, );
 
    #ifdef HAVE_CUDA
       dim3 blockSize( 0 ), gridSize( 0 );
@@ -496,16 +528,17 @@ void VectorOperations< Devices::Cuda > :: addVector( Vector1& y,
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index >
-__global__ void vectorAddVectorsCudaKernel( Real* v,
-                                            const Real* v1,
-                                            const Real* v2,
-                                            const Index size,
-                                            const Real multiplicator1,
-                                            const Real multiplicator2,
-                                            const Real thisMultiplicator )
+__global__ void
+vectorAddVectorsCudaKernel( Real* v,
+                            const Real* v1,
+                            const Real* v2,
+                            const Index size,
+                            const Real multiplicator1,
+                            const Real multiplicator2,
+                            const Real thisMultiplicator )
 {
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
+   Index elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+   const Index maxGridSize = blockDim.x * gridDim.x;
    if( thisMultiplicator == 1.0 )
       while( elementIdx < size )
       {
@@ -524,7 +557,6 @@ __global__ void vectorAddVectorsCudaKernel( Real* v,
 }
 #endif
 
-
 template< typename Vector1,
           typename Vector2,
           typename Vector3 >
@@ -537,15 +569,15 @@ addVectors( Vector1& v,
             const typename Vector3::RealType& multiplicator2,
             const typename Vector1::RealType& thisMultiplicator )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v.getSize() > 0, );
-   Assert( v.getSize() == v1.getSize(), );
-   Assert( v.getSize() == v2.getSize(), );
-   Assert( v.getData() != 0, );
-   Assert( v1.getData() != 0, );
-   Assert( v2.getData() != 0, );
+   TNL_ASSERT( v.getSize() > 0, );
+   TNL_ASSERT( v.getSize() == v1.getSize(), );
+   TNL_ASSERT( v.getSize() == v2.getSize(), );
+   TNL_ASSERT( v.getData() != 0, );
+   TNL_ASSERT( v1.getData() != 0, );
+   TNL_ASSERT( v2.getData() != 0, );
 
    #ifdef HAVE_CUDA
       dim3 blockSize( 0 ), gridSize( 0 );
@@ -571,50 +603,54 @@ addVectors( Vector1& v,
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Cuda >::computePrefixSum( Vector& v,
-                                                       typename Vector::IndexType begin,
-                                                       typename Vector::IndexType end )
+void
+VectorOperations< Devices::Cuda >::
+computePrefixSum( Vector& v,
+                  typename Vector::IndexType begin,
+                  typename Vector::IndexType end )
 {
    #ifdef HAVE_CUDA
-   typedef Algorithms::tnlParallelReductionSum< typename Vector::RealType,
-                                    typename Vector::IndexType > OperationType;
+   typedef Algorithms::tnlParallelReductionSum< typename Vector::RealType, typename Vector::IndexType > OperationType;
 
    OperationType operation;
    Algorithms::cudaPrefixSum< typename Vector::RealType,
-                  OperationType,
-                  typename Vector::IndexType >( end - begin,
-                                                256,
-                                                &v.getData()[ begin ],
-                                                &v.getData()[ begin ],
-                                                operation,
-                                                Algorithms::inclusivePrefixSum );
+                              OperationType,
+                              typename Vector::IndexType >
+                                 ( end - begin,
+                                   256,
+                                   &v.getData()[ begin ],
+                                   &v.getData()[ begin ],
+                                   operation,
+                                   Algorithms::inclusivePrefixSum );
    #else
       CudaSupportMissingMessage;;
    #endif
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Cuda >::computeExclusivePrefixSum( Vector& v,
-                                                                typename Vector::IndexType begin,
-                                                                typename Vector::IndexType end )
+void
+VectorOperations< Devices::Cuda >::
+computeExclusivePrefixSum( Vector& v,
+                           typename Vector::IndexType begin,
+                           typename Vector::IndexType end )
 {
 #ifdef HAVE_CUDA
-   typedef Algorithms::tnlParallelReductionSum< typename Vector::RealType,
-                                    typename Vector::IndexType > OperationType;
+   typedef Algorithms::tnlParallelReductionSum< typename Vector::RealType, typename Vector::IndexType > OperationType;
 
    OperationType operation;
-
    Algorithms::cudaPrefixSum< typename Vector::RealType,
-                  OperationType,
-                  typename Vector::IndexType >( end - begin,
-                                                256,
-                                                &v.getData()[ begin ],
-                                                &v.getData()[ begin ],
-                                                operation,
-                                                Algorithms::exclusivePrefixSum );
+                              OperationType,
+                              typename Vector::IndexType >
+                                 ( end - begin,
+                                   256,
+                                   &v.getData()[ begin ],
+                                   &v.getData()[ begin ],
+                                   operation,
+                                   Algorithms::exclusivePrefixSum );
 #endif
 }
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
 
@@ -624,6 +660,7 @@ void VectorOperations< Devices::Cuda >::computeExclusivePrefixSum( Vector& v,
 
 namespace TNL {
 namespace Containers {
+namespace Algorithms {
 
 /****
  * Max
@@ -875,6 +912,7 @@ extern template long double VectorOperations< Devices::Cuda >::getVectorDifferen
 #endif
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
similarity index 75%
rename from src/TNL/Containers/VectorOperationsHost_impl.h
rename to src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
index 102cc083d9db6959297e1c6dc6a3fc72a28c06e8..aab2a84769c1bf745351ec1e141df1eb740f04ce 100644
--- a/src/TNL/Containers/VectorOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
@@ -10,79 +10,114 @@
 
 #pragma once 
 
+#include <TNL/Containers/Algorithms/VectorOperations.h>
+
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
-static const int OpenMPVectorOperationsThreshold = 65536; // TODO: check this threshold
+static const int OpenMPVectorOperationsThreshold = 512; // TODO: check this threshold
 static const int PrefetchDistance = 128;
 
 template< typename Vector >
-void VectorOperations< Devices::Host >::addElement( Vector& v,
-                                                 const typename Vector::IndexType i,
-                                                 const typename Vector::RealType& value )
+void
+VectorOperations< Devices::Host >::
+addElement( Vector& v,
+            const typename Vector::IndexType i,
+            const typename Vector::RealType& value )
 {
    v[ i ] += value;
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Host >::addElement( Vector& v,
-                                                 const typename Vector::IndexType i,
-                                                 const typename Vector::RealType& value,
-                                                 const typename Vector::RealType& thisElementMultiplicator )
+void
+VectorOperations< Devices::Host >::
+addElement( Vector& v,
+            const typename Vector::IndexType i,
+            const typename Vector::RealType& value,
+            const typename Vector::RealType& thisElementMultiplicator )
 {
    v[ i ] = thisElementMultiplicator * v[ i ] + value;
 }
 
 template< typename Vector >
-typename Vector::RealType VectorOperations< Devices::Host >::getVectorMax( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Host >::
+getVectorMax( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
-   Real result = v. getElement( 0 );
-   const Index n = v. getSize();
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
+
+   Real result = v.getElement( 0 );
+   const Index n = v.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result = max( result, v. getElement( i ) );
+      result = max( result, v.getElement( i ) );
    return result;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Host > :: getVectorMin( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Host >::
+getVectorMin( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
-   Real result = v. getElement( 0 );
-   const Index n = v. getSize();
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
+
+   Real result = v.getElement( 0 );
+   const Index n = v.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result = min( result, v. getElement( i ) );
+      result = min( result, v.getElement( i ) );
    return result;
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Host > :: getVectorAbsMax( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Host >::
+getVectorAbsMax( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
-   Real result = std::fabs( v. getElement( 0 ) );
-   const Index n = v. getSize();
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
+
+   Real result = std::fabs( v.getElement( 0 ) );
+   const Index n = v.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result = max( result, ( Real ) std::fabs( v. getElement( i ) ) );
+      result = max( result, ( Real ) std::fabs( v.getElement( i ) ) );
    return result;
 }
 
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Host > :: getVectorAbsMin( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Host >::
+getVectorAbsMin( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
-   Real result = std::fabs( v. getElement( 0 ) );
-   const Index n = v. getSize();
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
+
+   Real result = std::fabs( v.getElement( 0 ) );
+   const Index n = v.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result = min( result, ( Real ) std::fabs( v. getElement( i ) ) );
+      result = min( result, ( Real ) std::fabs( v.getElement( i ) ) );
    return result;
 }
 
@@ -91,12 +126,13 @@ typename Vector::RealType
 VectorOperations< Devices::Host >::
 getVectorL1Norm( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0.0 );
-   const Index n = v. getSize();
+   const Index n = v.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -110,11 +146,12 @@ typename Vector::RealType
 VectorOperations< Devices::Host >::
 getVectorL2Norm( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
 
-   Assert( v. getSize() > 0, );
-   const Index n = v. getSize();
+   const Index n = v.getSize();
 
 #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
 #ifdef __GNUC__
@@ -170,20 +207,22 @@ template< typename Vector >
 typename Vector::RealType
 VectorOperations< Devices::Host >::
 getVectorLpNorm( const Vector& v,
-                 const typename Vector :: RealType& p )
+                 const typename Vector::RealType& p )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
-   Assert( p > 0.0,
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
+   TNL_ASSERT( p > 0.0,
               std::cerr << " p = " << p );
+
    if( p == 1.0 )
       return getVectorL1Norm( v );
    if( p == 2.0 )
       return getVectorL2Norm( v );
 
    Real result( 0.0 );
-   const Index n = v. getSize();
+   const Index n = v.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -193,14 +232,17 @@ getVectorLpNorm( const Vector& v,
 }
 
 template< typename Vector >
-typename Vector :: RealType VectorOperations< Devices::Host > :: getVectorSum( const Vector& v )
+typename Vector::RealType
+VectorOperations< Devices::Host >::
+getVectorSum( const Vector& v )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
-   Assert( v. getSize() > 0, );
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
+
+   TNL_ASSERT( v.getSize() > 0, );
 
    Real result( 0.0 );
-   const Index n = v. getSize();
+   const Index n = v.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -210,66 +252,88 @@ typename Vector :: RealType VectorOperations< Devices::Host > :: getVectorSum( c
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Host > :: getVectorDifferenceMax( const Vector1& v1,
-                                                                                       const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getVectorDifferenceMax( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
-   Real result = v1. getElement( 0 ) - v2. getElement( 0 );
-   const Index n = v1. getSize();
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
+
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
+
+   Real result = v1.getElement( 0 ) - v2.getElement( 0 );
+   const Index n = v1.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result =  max( result, v1. getElement( i ) - v2. getElement( i ) );
+      result = max( result, v1.getElement( i ) - v2.getElement( i ) );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Host > :: getVectorDifferenceMin( const Vector1& v1,
-                                                                                       const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getVectorDifferenceMin( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
-   Real result = v1. getElement( 0 ) - v2. getElement( 0 );
-   const Index n = v1. getSize();
+   Real result = v1.getElement( 0 ) - v2.getElement( 0 );
+   const Index n = v1.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result =  min( result, v1. getElement( i ) - v2. getElement( i ) );
+      result = min( result, v1.getElement( i ) - v2.getElement( i ) );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Host > :: getVectorDifferenceAbsMax( const Vector1& v1,
-                                                                                          const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getVectorDifferenceAbsMax( const Vector1& v1,
+                           const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
-   Real result = std::fabs( v1. getElement( 0 ) - v2. getElement( 0 ) );
-   const Index n = v1. getSize();
+   Real result = std::fabs( v1.getElement( 0 ) - v2.getElement( 0 ) );
+   const Index n = v1.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(max:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
-      result =  max( result, ( Real ) std::fabs( v1. getElement( i ) - v2. getElement( i ) ) );
+      result = max( result, ( Real ) std::fabs( v1.getElement( i ) - v2.getElement( i ) ) );
    return result;
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Host > :: getVectorDifferenceAbsMin( const Vector1& v1,
-                                                                                          const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getVectorDifferenceAbsMin( const Vector1& v1,
+                           const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result = std::fabs( v1[ 0 ] - v2[ 0 ] );
-   const Index n = v1. getSize();
+   const Index n = v1.getSize();
+#if defined( HAVE_OPENMP ) && _OPENMP >= 201107  // OpenMP 3.1 added support for min/max reduction operations
+#pragma omp parallel for reduction(min:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif
    for( Index i = 1; i < n; i ++ )
       result =  min( result, ( Real ) std::fabs( v1[ i ] - v2[ i ] ) );
    return result;
@@ -281,14 +345,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceL1Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0.0 );
-   const Index n = v1. getSize();
+   const Index n = v1.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -303,14 +367,14 @@ VectorOperations< Devices::Host >::
 getVectorDifferenceL2Norm( const Vector1& v1,
                            const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0.0 );
-   const Index n = v1. getSize();
+   const Index n = v1.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -330,13 +394,13 @@ getVectorDifferenceLpNorm( const Vector1& v1,
                            const Vector2& v2,
                            const typename Vector1::RealType& p )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( p > 0.0,
+   TNL_ASSERT( p > 0.0,
               std::cerr << " p = " << p );
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    if( p == 1.0 )
       return getVectorDifferenceL1Norm( v1, v2 );
@@ -344,46 +408,50 @@ getVectorDifferenceLpNorm( const Vector1& v1,
       return getVectorDifferenceL2Norm( v1, v2 );
 
    Real result( 0.0 );
-   const Index n = v1. getSize();
+   const Index n = v1.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
    for( Index i = 0; i < n; i ++ )
-      result += std::pow( std::fabs( v1. getElement( i ) - v2. getElement( i ) ), p );
+      result += std::pow( std::fabs( v1.getElement( i ) - v2.getElement( i ) ), p );
    return std::pow( result, 1.0 / p );
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1::RealType VectorOperations< Devices::Host > :: getVectorDifferenceSum( const Vector1& v1,
-                                                                                     const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getVectorDifferenceSum( const Vector1& v1,
+                        const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
 
    Real result( 0.0 );
-   const Index n = v1. getSize();
+   const Index n = v1.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() &&n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
    for( Index i = 0; i < n; i ++ )
-      result += v1. getElement( i ) - v2. getElement( i );
+      result += v1.getElement( i ) - v2.getElement( i );
    return result;
 }
 
 
 template< typename Vector >
-void VectorOperations< Devices::Host > :: vectorScalarMultiplication( Vector& v,
-                                                                   const typename Vector :: RealType& alpha )
+void
+VectorOperations< Devices::Host >::
+vectorScalarMultiplication( Vector& v,
+                            const typename Vector::RealType& alpha )
 {
-   typedef typename Vector :: RealType Real;
-   typedef typename Vector :: IndexType Index;
+   typedef typename Vector::RealType Real;
+   typedef typename Vector::IndexType Index;
 
-   Assert( v. getSize() > 0, );
+   TNL_ASSERT( v.getSize() > 0, );
 
-   const Index n = v. getSize();
+   const Index n = v.getSize();
 #ifdef HAVE_OPENMP
 #pragma omp parallel for if( n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
 #endif
@@ -393,15 +461,17 @@ void VectorOperations< Devices::Host > :: vectorScalarMultiplication( Vector& v,
 
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType VectorOperations< Devices::Host > :: getScalarProduct( const Vector1& v1,
-                                                                                 const Vector2& v2 )
+typename Vector1::RealType
+VectorOperations< Devices::Host >::
+getScalarProduct( const Vector1& v1,
+                  const Vector2& v2 )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v1. getSize() > 0, );
-   Assert( v1. getSize() == v2. getSize(), );
-   const Index n = v1. getSize();
+   TNL_ASSERT( v1.getSize() > 0, );
+   TNL_ASSERT( v1.getSize() == v2.getSize(), );
+   const Index n = v1.getSize();
 
 #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
 #ifdef __GNUC__
@@ -453,17 +523,20 @@ typename Vector1 :: RealType VectorOperations< Devices::Host > :: getScalarProdu
 }
 
 template< typename Vector1, typename Vector2 >
-void VectorOperations< Devices::Host > :: addVector( Vector1& y,
-                                                  const Vector2& x,
-                                                  const typename Vector2::RealType& alpha,
-                                                  const typename Vector1::RealType& thisMultiplicator )
+void
+VectorOperations< Devices::Host >::
+addVector( Vector1& y,
+           const Vector2& x,
+           const typename Vector2::RealType& alpha,
+           const typename Vector1::RealType& thisMultiplicator )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( x. getSize() > 0, );
-   Assert( x. getSize() == y. getSize(), );
-   const Index n = y. getSize();
+   TNL_ASSERT( x.getSize() > 0, );
+   TNL_ASSERT( x.getSize() == y.getSize(), );
+
+   const Index n = y.getSize();
 
 #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
 #ifdef __GNUC__
@@ -527,12 +600,12 @@ addVectors( Vector1& v,
             const typename Vector3::RealType& multiplicator2,
             const typename Vector1::RealType& thisMultiplicator )
 {
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
+   typedef typename Vector1::RealType Real;
+   typedef typename Vector1::IndexType Index;
 
-   Assert( v.getSize() > 0, );
-   Assert( v.getSize() == v1.getSize(), );
-   Assert( v.getSize() == v2.getSize(), );
+   TNL_ASSERT( v.getSize() > 0, );
+   TNL_ASSERT( v.getSize() == v1.getSize(), );
+   TNL_ASSERT( v.getSize() == v2.getSize(), );
  
    const Index n = v.getSize();
    if( thisMultiplicator == 1.0 )
@@ -550,22 +623,28 @@ addVectors( Vector1& v,
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Host >::computePrefixSum( Vector& v,
-                                                       typename Vector::IndexType begin,
-                                                       typename Vector::IndexType end )
+void
+VectorOperations< Devices::Host >::
+computePrefixSum( Vector& v,
+                  typename Vector::IndexType begin,
+                  typename Vector::IndexType end )
 {
    typedef typename Vector::IndexType Index;
+
    for( Index i = begin + 1; i < end; i++ )
       v[ i ] += v[ i - 1 ];
 }
 
 template< typename Vector >
-void VectorOperations< Devices::Host >::computeExclusivePrefixSum( Vector& v,
-                                                                typename Vector::IndexType begin,
-                                                                typename Vector::IndexType end )
+void
+VectorOperations< Devices::Host >::
+computeExclusivePrefixSum( Vector& v,
+                           typename Vector::IndexType begin,
+                           typename Vector::IndexType end )
 {
    typedef typename Vector::IndexType Index;
    typedef typename Vector::RealType Real;
+
    Real aux( v[ begin ] );
    v[ begin ] = 0.0;
    for( Index i = begin + 1; i < end; i++ )
@@ -576,8 +655,9 @@ void VectorOperations< Devices::Host >::computeExclusivePrefixSum( Vector& v,
    }
 }
 
+} // namespace Algorithms
 } // namespace Containers
-} //namespace TNL
+} // namespace TNL
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
@@ -585,6 +665,7 @@ void VectorOperations< Devices::Host >::computeExclusivePrefixSum( Vector& v,
 
 namespace TNL {
 namespace Containers {   
+namespace Algorithms {
 
 /****
  * Max
@@ -837,8 +918,8 @@ extern template long double VectorOperations< Devices::Host >::getVectorDifferen
 #endif
 #endif
 
+} // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
-#endif
-
 
+#endif
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
index d0591050d556aaca779ee2e861c6c6166b9d7346..971544a024454ef488db269cbddd3b4c755d055f 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
@@ -31,7 +31,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
                                               DataType* output,
                                               DataType* auxArray )
 {
-   DataType* sharedData = TNL::Devices::getSharedMemory< DataType >();
+   DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
    DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
    DataType* warpSums = &auxData[ blockDim. x ];
 
diff --git a/src/TNL/Containers/Algorithms/reduction-operations.h b/src/TNL/Containers/Algorithms/reduction-operations.h
index deec09a3c55a19f1bfde4aa03514be371b4c36d8..8d5499448457651789e122261c62fe03fa959cc8 100644
--- a/src/TNL/Containers/Algorithms/reduction-operations.h
+++ b/src/TNL/Containers/Algorithms/reduction-operations.h
@@ -76,7 +76,7 @@ class tnlParallelReductionMin
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return min( current, data1[ idx ] );
+      return TNL::min( current, data1[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return MaxValue< ResultType>(); };
@@ -117,7 +117,7 @@ class tnlParallelReductionMax
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return max( current, data1[ idx ] );
+      return TNL::max( current, data1[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return MinValue< ResultType>(); };
@@ -241,7 +241,7 @@ class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return current + ::abs( data1[ idx ] );
+      return current + TNL::abs( data1[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -270,7 +270,7 @@ class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return min( current, abs( data1[ idx ] ) );
+      return TNL::min( current, TNL::abs( data1[ idx ] ) );
    };
 
    __cuda_callable__ ResultType initialValue() { return MaxValue< ResultType>(); };
@@ -299,7 +299,7 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return std::max( current, ::abs( data1[ idx ] ) );
+      return TNL::max( current, TNL::abs( data1[ idx ] ) );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -365,7 +365,7 @@ class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return current + ::pow( ::abs( data1[ idx ] ), p );
+      return current + TNL::pow( TNL::abs( data1[ idx ] ), p );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -514,7 +514,7 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return min( current, data1[ idx ] - data2[ idx ] );
+      return TNL::min( current, data1[ idx ] - data2[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return MaxValue< ResultType>(); };
@@ -543,7 +543,7 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return max( current, data1[ idx ] - data2[ idx ] );
+      return TNL::max( current, data1[ idx ] - data2[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -572,7 +572,7 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return current + abs( data1[ idx ] - data2[ idx ] );
+      return current + TNL::abs( data1[ idx ] - data2[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -601,7 +601,7 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return min( current, abs( data1[ idx ] - data2[ idx ] ) );
+      return TNL::min( current, TNL::abs( data1[ idx ] - data2[ idx ] ) );
    };
 
    __cuda_callable__ ResultType initialValue() { return MaxValue< ResultType>(); };
@@ -630,7 +630,7 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return max( current, abs( data1[ idx ] - data2[ idx ] ) );
+      return TNL::max( current, TNL::abs( data1[ idx ] - data2[ idx ] ) );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
@@ -699,7 +699,7 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
                             const RealType* data1,
                             const RealType* data2 )
    {
-      return current + ::pow( abs( data1[ idx ] - data2[ idx ] ), p );
+      return current + TNL::pow( TNL::abs( data1[ idx ] - data2[ idx ] ), p );
    };
 
    __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 8eb769622002c12ac6d5d55d68cc22d4ffeea175..92a3ccd6fd519b50435751e3753218b7060778a4 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -11,22 +11,13 @@
 #pragma once 
 
 #include <TNL/Object.h>
-#include <TNL/Containers/SharedArray.h>
+#include <TNL/File.h>
+#include <TNL/Devices/Host.h>
 
-// Forward declarations
 namespace TNL {
-class File;
-
-namespace Devices {
-   class Host;
-}
-
-
 namespace Containers {
 
-
-template< typename Element, typename Device, typename Index >
-class SharedArray;
+template< int, typename > class StaticArray;
 
 /****
  * Array handles memory allocation and sharing of the same data between more Arrays.
diff --git a/src/TNL/Containers/Array_impl.h b/src/TNL/Containers/Array_impl.h
index df751fcd420a212883ffd3944c72a77eb89f4398..65c2bc58f487460137b3bc4996264552345ab062 100644
--- a/src/TNL/Containers/Array_impl.h
+++ b/src/TNL/Containers/Array_impl.h
@@ -15,7 +15,7 @@
 #include <TNL/File.h>
 #include <TNL/Math.h>
 #include <TNL/param-types.h>
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Containers/ArrayIO.h>
 #include <TNL/Containers/Array.h>
 
@@ -23,8 +23,8 @@ namespace TNL {
 namespace Containers {   
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index >::
 Array()
 : size( 0 ),
@@ -35,8 +35,8 @@ Array()
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index >::
 Array( const IndexType& size )
 : size( 0 ),
@@ -48,11 +48,11 @@ Array( const IndexType& size )
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index >::
 Array( Element* data,
-          const IndexType& size )
+       const IndexType& size )
 : size( size ),
   data( data ),
   allocationPointer( 0 ),
@@ -61,20 +61,20 @@ Array( Element* data,
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index >::
 Array( Array< Element, Device, Index >& array,
-          const IndexType& begin,
-          const IndexType& size )
+       const IndexType& begin,
+       const IndexType& size )
 : size( size ),
   data( &array.getData()[ begin ] ),
   allocationPointer( array.allocationPointer ),
   referenceCounter( 0 )
 {
-   Assert( begin < array.getSize(),
+   TNL_ASSERT( begin < array.getSize(),
               std::cerr << " begin = " << begin << " array.getSize() = " << array.getSize() );
-   Assert( begin + size  < array.getSize(),
+   TNL_ASSERT( begin + size <= array.getSize(),
               std::cerr << " begin = " << begin << " size = " << size <<  " array.getSize() = " << array.getSize() );
    if( ! this->size )
       this->size = array.getSize() - begin;
@@ -106,8 +106,8 @@ getType()
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 String
 Array< Element, Device, Index >::
 getTypeVirtual() const
@@ -116,8 +116,8 @@ getTypeVirtual() const
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 String
 Array< Element, Device, Index >::
 getSerializationType()
@@ -126,8 +126,8 @@ getSerializationType()
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 String
 Array< Element, Device, Index >::
 getSerializationTypeVirtual() const
@@ -136,8 +136,8 @@ getSerializationTypeVirtual() const
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 void
 Array< Element, Device, Index >::
 releaseData() const
@@ -146,14 +146,14 @@ releaseData() const
    {
       if( --*this->referenceCounter == 0 )
       {
-         ArrayOperations< Device >::freeMemory( this->allocationPointer );
+         Algorithms::ArrayOperations< Device >::freeMemory( this->allocationPointer );
          delete this->referenceCounter;
          //std::cerr << "Deallocating reference counter " << this->referenceCounter << std::endl;
       }
    }
    else
       if( allocationPointer )
-         ArrayOperations< Device >::freeMemory( this->allocationPointer );
+         Algorithms::ArrayOperations< Device >::freeMemory( this->allocationPointer );
    this->allocationPointer = 0;
    this->data = 0;
    this->size = 0;
@@ -167,15 +167,15 @@ bool
 Array< Element, Device, Index >::
 setSize( const Index size )
 {
-   Assert( size >= 0,
+   TNL_ASSERT( size >= 0,
               std::cerr << "You try to set size of Array to negative value."
                         << "New size: " << size << std::endl );
    if( this->size == size && allocationPointer && ! referenceCounter ) return true;
    this->releaseData();
-   ArrayOperations< Device >::allocateMemory( this->allocationPointer, size );
+   Algorithms::ArrayOperations< Device >::allocateMemory( this->allocationPointer, size );
    this->data = this->allocationPointer;
    this->size = size;
-   if( ! this->allocationPointer )
+   if( size > 0 && ! this->allocationPointer )
    {
       std::cerr << "I am not able to allocate new array with size "
                 << ( double ) this->size * sizeof( ElementType ) / 1.0e9 << " GB." << std::endl;
@@ -186,22 +186,22 @@ setSize( const Index size )
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
    template< typename ArrayT >
 bool
 Array< Element, Device, Index >::
 setLike( const ArrayT& array )
 {
-   Assert( array. getSize() >= 0,
+   TNL_ASSERT( array. getSize() >= 0,
               std::cerr << "You try to set size of Array to negative value."
                         << "Array size: " << array. getSize() << std::endl );
    return setSize( array.getSize() );
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 void
 Array< Element, Device, Index >::
 bind( Element* data,
@@ -213,8 +213,8 @@ bind( Element* data,
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
    template< typename ArrayT >
 void
 Array< Element, Device, Index >::
@@ -222,10 +222,13 @@ bind( const ArrayT& array,
       const IndexType& begin,
       const IndexType& size )
 {
-   Assert( ( std::is_same< Device, typename ArrayT::DeviceType>::value ), );
-   Assert( begin <= array.getSize(),
+   // all template parameters of Array must match, otherwise binding does not make sense
+   static_assert( std::is_same< Element, typename ArrayT::ElementType >::value, "ElementType of both arrays must be the same." );
+   static_assert( std::is_same< Device, typename ArrayT::DeviceType >::value, "DeviceType of both arrays must be the same." );
+   static_assert( std::is_same< Index, typename ArrayT::IndexType >::value, "IndexType of both arrays must be the same." );
+   TNL_ASSERT( begin <= array.getSize(),
               std::cerr << " begin = " << begin << " array.getSize() = " << array.getSize() );
-   Assert( begin + size  <= array.getSize(),
+   TNL_ASSERT( begin + size  <= array.getSize(),
               std::cerr << " begin = " << begin << " size = " << size <<  " array.getSize() = " << array.getSize() );
  
    this->releaseData();
@@ -252,8 +255,8 @@ bind( const ArrayT& array,
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
    template< int Size >
 void
 Array< Element, Device, Index >::
@@ -300,31 +303,31 @@ getSize() const
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 void
 Array< Element, Device, Index >::
 setElement( const Index& i, const Element& x )
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for setElement method in Array "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
-   return ArrayOperations< Device > :: setMemoryElement( &( this->data[ i ] ), x );
+   return Algorithms::ArrayOperations< Device > :: setMemoryElement( &( this->data[ i ] ), x );
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Element
 Array< Element, Device, Index >::
 getElement( const Index& i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for getElement method in Array "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
-   return ArrayOperations< Device >::getMemoryElement( & ( this->data[ i ] ) );
+   return Algorithms::ArrayOperations< Device >::getMemoryElement( & ( this->data[ i ] ) );
 };
 
 template< typename Element,
@@ -335,7 +338,7 @@ inline Element&
 Array< Element, Device, Index >::
 operator[] ( const Index& i )
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for operator[] in Array "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
@@ -343,14 +346,14 @@ operator[] ( const Index& i )
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 __cuda_callable__
 inline const Element&
 Array< Element, Device, Index >::
 operator[] ( const Index& i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for operator[] in Array "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
@@ -358,43 +361,45 @@ operator[] ( const Index& i ) const
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index >&
 Array< Element, Device, Index >::
 operator = ( const Array< Element, Device, Index >& array )
 {
-   Assert( array. getSize() == this->getSize(),
+   TNL_ASSERT( array. getSize() == this->getSize(),
               std::cerr << "Source size: " << array. getSize() << std::endl
                         << "Target size: " << this->getSize() << std::endl );
-   ArrayOperations< Device > ::
-   template copyMemory< Element,
-                        Element,
-                        Index >
-                       ( this->getData(),
-                         array. getData(),
-                         array. getSize() );
+   if( this->getSize() > 0 )
+      Algorithms::ArrayOperations< Device >::
+         template copyMemory< Element,
+                              Element,
+                              Index >
+                             ( this->getData(),
+                               array. getData(),
+                               array. getSize() );
    return ( *this );
 };
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
    template< typename ArrayT >
 Array< Element, Device, Index >&
 Array< Element, Device, Index >::
 operator = ( const ArrayT& array )
 {
-   Assert( array. getSize() == this->getSize(),
+   TNL_ASSERT( array. getSize() == this->getSize(),
               std::cerr << "Source size: " << array. getSize() << std::endl
                         << "Target size: " << this->getSize() << std::endl );
-   ArrayOperations< Device, typename ArrayT::DeviceType > ::
-    template copyMemory< Element,
-                         typename ArrayT::ElementType,
-                         typename ArrayT::IndexType >
-                       ( this->getData(),
-                         array. getData(),
-                         array. getSize() );
+   if( this->getSize() > 0 )
+      Algorithms::ArrayOperations< Device, typename ArrayT::DeviceType >::
+         template copyMemory< Element,
+                              typename ArrayT::ElementType,
+                              typename ArrayT::IndexType >
+                            ( this->getData(),
+                              array. getData(),
+                              array. getSize() );
    return ( *this );
 };
 
@@ -408,13 +413,15 @@ operator == ( const ArrayT& array ) const
 {
    if( array. getSize() != this->getSize() )
       return false;
-   return ArrayOperations< Device, typename ArrayT::DeviceType > ::
-    template compareMemory< typename ArrayT::ElementType,
-                            Element,
-                            typename ArrayT::IndexType >
-                          ( this->getData(),
-                            array.getData(),
-                            array.getSize() );
+   if( this->getSize() == 0 )
+      return true;
+   return Algorithms::ArrayOperations< Device, typename ArrayT::DeviceType >::
+      template compareMemory< typename ArrayT::ElementType,
+                              Element,
+                              typename ArrayT::IndexType >
+                            ( this->getData(),
+                              array.getData(),
+                              array.getSize() );
 }
 
 template< typename Element,
@@ -432,13 +439,13 @@ template< typename Element,
           typename Index >
 void Array< Element, Device, Index > :: setValue( const Element& e )
 {
-   Assert( this->getData(),);
-   ArrayOperations< Device > :: setMemory( this->getData(), e, this->getSize() );
+   TNL_ASSERT( this->getData(),);
+   Algorithms::ArrayOperations< Device >::setMemory( this->getData(), e, this->getSize() );
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 __cuda_callable__
 const Element* Array< Element, Device, Index > :: getData() const
 {
@@ -446,8 +453,8 @@ const Element* Array< Element, Device, Index > :: getData() const
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 __cuda_callable__
 Element* Array< Element, Device, Index > :: getData()
 {
@@ -455,8 +462,8 @@ Element* Array< Element, Device, Index > :: getData()
 }
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Array< Element, Device, Index > :: operator bool() const
 {
    return data != 0;
@@ -464,8 +471,8 @@ Array< Element, Device, Index > :: operator bool() const
 
 
 template< typename Element,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
    template< typename IndexType2 >
 void Array< Element, Device, Index > :: touch( IndexType2 touches ) const
 {
diff --git a/src/TNL/Containers/CMakeLists.txt b/src/TNL/Containers/CMakeLists.txt
index d67b8f8b673bc08eeccdfb5a827d1ca016da957a..b35fa285d9623457ddd5c45a887428a55161fde1 100755
--- a/src/TNL/Containers/CMakeLists.txt
+++ b/src/TNL/Containers/CMakeLists.txt
@@ -3,12 +3,11 @@ ADD_SUBDIRECTORY( Algorithms )
 set( headers Array.h
              Array_impl.h
              ArrayIO.h
-             ArrayOperations.h
-             ArrayOperationsHost_impl.h
-             ArrayOperationsCuda_impl.h
              DynamicTypeTag.h
              IndexedSet.h
              IndexedSet_impl.h
+             List.h
+             List_impl.h
              MultiArray.h
              MultiArray1D_impl.h
              MultiArray2D_impl.h
@@ -29,9 +28,6 @@ set( headers Array.h
              MultiVector2D_impl.h
              MultiVector3D_impl.h
              MultiVector4D_impl.h
-             VectorOperations.h
-             VectorOperationsHost_impl.h
-             VectorOperationsCuda_impl.h                                      
              SharedVector.h
              SharedVector_impl.h
              StaticVector.h 
@@ -46,7 +42,6 @@ set( common_SOURCES
      ${CURRENT_DIR}/MultiArray_impl.cpp
      ${CURRENT_DIR}/Array_impl.cpp
      ${CURRENT_DIR}/StaticArray_impl.cpp 
-     ${CURRENT_DIR}/VectorOperationsHost_impl.cpp
      ${CURRENT_DIR}/MultiVector_impl.cpp
      ${CURRENT_DIR}/SharedVector_impl.cpp
      ${CURRENT_DIR}/Vector_impl.cpp
@@ -56,26 +51,17 @@ set( common_SOURCES
 IF( BUILD_CUDA )
    set( tnl_containers_CUDA__SOURCES
         ${common_SOURCES}
-        ${CURRENT_DIR}/ArrayOperationsHost_impl.cu
-        ${CURRENT_DIR}/ArrayOperationsCuda_impl.cu
         ${CURRENT_DIR}/Array_impl.cu
         ${CURRENT_DIR}/SharedArray_impl.cu
         ${CURRENT_DIR}/MultiArray_impl.cu
         ${CURRENT_DIR}/StaticArray_impl.cu
-        ${CURRENT_DIR}/VectorOperationsCuda_impl.cu
         ${CURRENT_DIR}/Vector_impl.cu
         ${CURRENT_DIR}/StaticVector_impl.cu 
         PARENT_SCOPE )
-ELSE()
-   set( common_SOURCES
-        ${common_SOURCES}
-        ${CURRENT_DIR}/ArrayOperationsHost_impl.cpp
-        ${CURRENT_DIR}/ArrayOperationsCuda_impl.cpp
- )               
 ENDIF()    
 
 set( tnl_containers_SOURCES     
      ${common_SOURCES}
      PARENT_SCOPE )
                    
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/TNL/Containers )
\ No newline at end of file
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/TNL/Containers )
diff --git a/src/TNL/Containers/ConstSharedArray_impl.h b/src/TNL/Containers/ConstSharedArray_impl.h
index d6a036459116503fd74f0ca505b2b9ca0ea42e44..93b662269f064e45c8dfb5b7024f0f9fbdb27c4f 100644
--- a/src/TNL/Containers/ConstSharedArray_impl.h
+++ b/src/TNL/Containers/ConstSharedArray_impl.h
@@ -13,7 +13,7 @@
 #include <iostream>
 #include <TNL/File.h>
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Math.h>
 #include <TNL/param-types.h>
 
@@ -69,10 +69,10 @@ template< typename Element,
 void tnlConstSharedArray< Element, Device, Index > :: bind( const Element* data,
                                                             const Index size )
 {
-   Assert( size >= 0,
+   TNL_ASSERT( size >= 0,
               std::cerr << "You try to set size of tnlConstSharedArray to negative value."
                         << "New size: " << size << std::endl );
-   Assert( data != 0,
+   TNL_ASSERT( data != 0,
               std::cerr << "You try to use null pointer to data for tnlConstSharedArray." );
 
    this->size = size;
@@ -88,7 +88,7 @@ void tnlConstSharedArray< Element, Device, Index > :: bind( const Array& array,
                                                             IndexType size )
 {
    // TODO: This does not work for static arrays.
-   //tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
+   //tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
    //                 "Attempt to bind arrays between different devices." );
    this->data = &( array. getData()[ index ] );
    if( ! size )
@@ -130,11 +130,11 @@ template< typename Element,
           typename Index >
 Element tnlConstSharedArray< Element, Device, Index > :: getElement( Index i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for getElement method in tnlConstSharedArray with name "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
-   return ArrayOperations< Device >::getMemoryElement( &( this->data[ i ] ) );
+   return Algorithms::ArrayOperations< Device >::getMemoryElement( &( this->data[ i ] ) );
 };
 
 template< typename Element,
@@ -143,12 +143,12 @@ template< typename Element,
 __cuda_callable__
 const Element& tnlConstSharedArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for operator[] in tnlConstSharedArray with name "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
    // TODO: add static assert - this does not make sense for Devices::CudaDevice
-   return ArrayOperations< Device >::getArrayElementReference( this->data, i );
+   return Algorithms::ArrayOperations< Device >::getArrayElementReference( this->data, i );
 };
 
 template< typename Element,
@@ -179,14 +179,13 @@ bool tnlConstSharedArray< Element, Device, Index > :: operator == ( const Array&
 {
    if( array. getSize() != this->getSize() )
       return false;
-   return ArrayOperations< Device,
-                              typename Array :: DeviceType > ::
-    template compareMemory< typename Array :: ElementType,
-                            Element,
-                            typename Array :: IndexType >
-                          ( this->getData(),
-                            array. getData(),
-                            array. getSize() );
+   return Algorithms::ArrayOperations< Device, typename Array :: DeviceType >::
+      template compareMemory< typename Array :: ElementType,
+                              Element,
+                              typename Array :: IndexType >
+                            ( this->getData(),
+                              array. getData(),
+                              array. getSize() );
 }
 
 template< typename Element,
@@ -229,7 +228,7 @@ template< typename Element,
           typename Index >
 bool tnlConstSharedArray< Element, Device, Index > :: save( File& file ) const
 {
-   Assert( this->size != 0,
+   TNL_ASSERT( this->size != 0,
               std::cerr << "You try to save empty array." );
    if( ! Object :: save( file ) )
       return false;
diff --git a/src/TNL/Containers/IndexedSet_impl.h b/src/TNL/Containers/IndexedSet_impl.h
index 230d29303fbeb1eed17e5ca901253d06dbbd8cee..54c326f65bb24c877fd001c51c8a558cabebc27b 100644
--- a/src/TNL/Containers/IndexedSet_impl.h
+++ b/src/TNL/Containers/IndexedSet_impl.h
@@ -57,7 +57,7 @@ template< typename Element,
    template<typename ArrayType>
 void IndexedSet< Element, Index, Key >::toArray( ArrayType& array ) const
 {
-   Assert( array.getSize() == getSize(),
+   TNL_ASSERT( array.getSize() == getSize(),
               std::cerr << "array.getSize() = " << array.getSize()
                    << " getSize() = " << getSize() );
 
diff --git a/src/TNL/List.h b/src/TNL/Containers/List.h
similarity index 83%
rename from src/TNL/List.h
rename to src/TNL/Containers/List.h
index 9d0f2cdb1a12a58087863576dfcfd1cbd793d117..adb163374dc7a3e9d7bbb626605bbebeb2e0c92d 100644
--- a/src/TNL/List.h
+++ b/src/TNL/Containers/List.h
@@ -10,16 +10,18 @@
 
 #pragma once
 
-#include <TNL/Assert.h>
 #include <stdlib.h>
 #include <iostream>
+
+#include <TNL/Assert.h>
+#include <TNL/File.h>
 #include <TNL/String.h>
 #include <TNL/param-types.h>
 
 namespace TNL {
+namespace Containers {
 
-class File;
-template< class T > class DataElement;
+template< class T > class ListDataElement;
 
 //! Template for double linked lists
 /*! To acces elements in the list one can use method getSize() and
@@ -69,6 +71,10 @@ template< class T > class List
 
       const List& operator = ( const List& lst );
 
+      bool operator == ( const List& lst ) const;
+
+      bool operator != ( const List& lst ) const;
+
       //! Append new data element
       bool Append( const T& data );
 
@@ -114,18 +120,18 @@ template< class T > class List
    protected:
 
       //! Pointer to the first element
-      DataElement< T >* first;
+      ListDataElement< T >* first;
 
       //! Pointer to the last element
       /*! We use pointer to last element while adding new element to keep order of elements
        */
-      DataElement< T >* last;
+      ListDataElement< T >* last;
 
       //! List size
       int size;
 
       //! Iterator
-      mutable DataElement< T >* iterator;
+      mutable ListDataElement< T >* iterator;
 
       //! Iterator index
       mutable int index;
@@ -136,33 +142,33 @@ template< class T > class List
 template< typename T > std::ostream& operator << ( std::ostream& str, const List< T >& list );
 
 //! Data element for List and mStack
-template< class T > class DataElement
+template< class T > class ListDataElement
 {
    //! Main data
    T data;
 
    //! Pointer to the next element
-   DataElement< T >* next;
+   ListDataElement< T >* next;
 
    //! Pointer to the previous element
-   DataElement< T >* previous;
+   ListDataElement< T >* previous;
 
    public:
    //! Basic constructor
-   DataElement()
+   ListDataElement()
       : next( 0 ),
         previous( 0 ){};
 
    //! Constructor with given data and possibly pointer to next element
-   DataElement( const T& dt,
-                   DataElement< T >* prv = 0,
-                   DataElement< T >* nxt = 0 )
+   ListDataElement( const T& dt,
+                    ListDataElement< T >* prv = 0,
+                    ListDataElement< T >* nxt = 0 )
       : data( dt ),
         next( nxt ),
         previous( prv ){};
 
    //! Destructor
-   ~DataElement(){};
+   ~ListDataElement(){};
 
    //! Return data for non-const instances
    T& Data() { return data; };
@@ -171,19 +177,19 @@ template< class T > class DataElement
    const T& Data() const { return data; };
 
    //! Return pointer to the next element for non-const instances
-   DataElement< T >*& Next() { return next; };
+   ListDataElement< T >*& Next() { return next; };
 
    //! Return pointer to the next element for const instances
-   const DataElement< T >* Next() const { return next; };
+   const ListDataElement< T >* Next() const { return next; };
 
    //! Return pointer to the previous element for non-const instances
-   DataElement< T >*& Previous() { return previous; };
+   ListDataElement< T >*& Previous() { return previous; };
 
    //! Return pointer to the previous element for const instances
-   const DataElement< T >* Previous() const { return previous; };
-
+   const ListDataElement< T >* Previous() const { return previous; };
 };
 
+} // namespace Containers
 } // namespace TNL
 
-#include <TNL/List_impl.h>
+#include <TNL/Containers/List_impl.h>
diff --git a/src/TNL/List_impl.h b/src/TNL/Containers/List_impl.h
similarity index 85%
rename from src/TNL/List_impl.h
rename to src/TNL/Containers/List_impl.h
index 389fb1732b63cea1cd95336a7a9ac5bf9f80dc5f..f91f30ca2b7fd7578a109d0786bfcbc93c91a100 100644
--- a/src/TNL/List_impl.h
+++ b/src/TNL/Containers/List_impl.h
@@ -10,9 +10,11 @@
 
 #pragma once
 
-#include <TNL/File.h>
+#include <TNL/Containers/List.h>
+#include <TNL/Math.h>
 
 namespace TNL {
+namespace Containers {
 
 template< typename T >
 List< T >::List()
@@ -54,8 +56,8 @@ int List< T >::getSize() const
 template< typename T >
 T& List< T >::operator[]( const int& ind )
 {
-   Assert( ind < size, );
-   int iter_dist = abs( index - ind );
+   TNL_ASSERT( ind < size, );
+   int iter_dist = TNL::abs( index - ind );
    if( ! iterator ||
        iter_dist > ind ||
        iter_dist > size - ind )
@@ -87,7 +89,7 @@ T& List< T >::operator[]( const int& ind )
          iterator = iterator -> Next();
          index ++;
       }
-      Assert( iterator, );
+      TNL_ASSERT( iterator, );
    }
    return iterator -> Data();
 };
@@ -105,20 +107,37 @@ const List< T >& List< T >::operator = ( const List& lst )
    return( *this );
 }
 
+template< typename T >
+bool List< T >::operator == ( const List& lst ) const
+{
+   if( this->getSize() != lst.getSize() )
+      return false;
+   for( int i = 0; i < this->getSize(); i++ )
+      if( (*this)[ i ] != lst[ i ] )
+         return false;
+   return true;
+}
+
+template< typename T >
+bool List< T >::operator != ( const List& lst ) const
+{
+   return ! operator==( lst );
+}
+
 template< typename T >
 bool List< T >::Append( const T& data )
 {
    if( ! first )
    {
-      Assert( ! last, );
-      first = last = new DataElement< T >( data );
+      TNL_ASSERT( ! last, );
+      first = last = new ListDataElement< T >( data );
       if( ! first ) return false;
    }
    else
    {
-      DataElement< T >* new_element =  new DataElement< T >( data, last, 0 );
+      ListDataElement< T >* new_element =  new ListDataElement< T >( data, last, 0 );
       if( ! new_element ) return false;
-      Assert( last, );
+      TNL_ASSERT( last, );
       last = last -> Next() = new_element;
    }
    size ++;
@@ -130,13 +149,13 @@ bool List< T >::Prepend( const T& data )
 {
    if( ! first )
    {
-      Assert( ! last, );
-      first = last = new DataElement< T >( data );
+      TNL_ASSERT( ! last, );
+      first = last = new ListDataElement< T >( data );
       if( ! first ) return false;
    }
    else
    {
-      DataElement< T >* new_element =  new DataElement< T >( data, 0, first );
+      ListDataElement< T >* new_element =  new ListDataElement< T >( data, 0, first );
       if( ! new_element ) return false;
       first = first -> Previous() = new_element;
    }
@@ -148,12 +167,12 @@ bool List< T >::Prepend( const T& data )
 template< typename T >
 bool List< T >::Insert( const T& data, const int& ind )
 {
-   Assert( ind <= size || ! size, );
+   TNL_ASSERT( ind <= size || ! size, );
    if( ind == 0 ) return Prepend( data );
    if( ind == size ) return Append( data );
    operator[]( ind );
-   DataElement< T >* new_el =
-      new DataElement< T >( data,
+   ListDataElement< T >* new_el =
+      new ListDataElement< T >( data,
                              iterator -> Previous(),
                              iterator );
    if( ! new_el ) return false;
@@ -189,7 +208,7 @@ template< typename T >
    template< typename Array >
 void List< T >::toArray( Array& array )
 {
-   Assert( this->getSize() <= array.getSize(),
+   TNL_ASSERT( this->getSize() <= array.getSize(),
               std::cerr << "this->getSize() = " << this->getSize()
                    << " array.getSize() = " << array.getSize() << std::endl; );
    for( int i = 0; i < this->getSize(); i++ )
@@ -200,7 +219,7 @@ template< typename T >
 void List< T >::Erase( const int& ind )
 {
    operator[]( ind );
-   DataElement< T >* tmp_it = iterator;
+   ListDataElement< T >* tmp_it = iterator;
    if( iterator -> Next() )
       iterator -> Next() -> Previous() = iterator -> Previous();
    if( iterator -> Previous() )
@@ -229,10 +248,10 @@ template< typename T >
 void List< T >::reset()
 {
    iterator = first;
-   DataElement< T >* tmp_it;
+   ListDataElement< T >* tmp_it;
    while( iterator )
    {
-      Assert( iterator, );
+      TNL_ASSERT( iterator, );
       tmp_it = iterator;
       iterator = iterator -> Next();
       delete tmp_it;
@@ -245,7 +264,7 @@ template< typename T >
 void List< T >::DeepEraseAll()
 {
    iterator = first;
-   DataElement< T >* tmp_it;
+   ListDataElement< T >* tmp_it;
    int i( 0 );
    while( iterator )
    {
@@ -381,6 +400,5 @@ std::ostream& operator << ( std::ostream& str, const List< T >& list )
    return str;
 };
 
+} // namespace Containers
 } // namespace TNL
-
-
diff --git a/src/TNL/Containers/MultiArray1D_impl.h b/src/TNL/Containers/MultiArray1D_impl.h
index 3a75633a99e3f37e9fead7af0489bd057738a462..ee57a53f085393c216bdbf2a53993b44322efb05 100644
--- a/src/TNL/Containers/MultiArray1D_impl.h
+++ b/src/TNL/Containers/MultiArray1D_impl.h
@@ -59,7 +59,7 @@ String MultiArray< 1, Element, Device, Index > :: getSerializationTypeVirtual()
 template< typename Element, typename Device, typename Index >
 bool MultiArray< 1, Element, Device, Index > :: setDimensions( const Index iSize )
 {
-   Assert( iSize > 0,
+   TNL_ASSERT( iSize > 0,
               std::cerr << "iSize = " << iSize );
    dimensions[ 0 ] = iSize;
    return Array< Element, Device, Index >::setSize( iSize );
@@ -68,7 +68,7 @@ bool MultiArray< 1, Element, Device, Index > :: setDimensions( const Index iSize
 template< typename Element, typename Device, typename Index >
 bool MultiArray< 1, Element, Device, Index > :: setDimensions( const Containers::StaticVector< 1, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0,
               std::cerr << " dimensions[ 0 ] = " << dimensions[ 0 ] );
    this->dimensions = dimensions;
    return Array< Element, Device, Index >::setSize( this->dimensions[ 0 ] );
@@ -106,7 +106,7 @@ template< typename Element, typename Device, typename Index >
 __cuda_callable__
 Index MultiArray< 1, Element, Device, Index > :: getElementIndex( const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ],
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ],
               std::cerr << "i = " << i << " this->dimensions[ 0 ] = " <<  this->dimensions[ 0 ] );
    return i;
 }
@@ -142,7 +142,7 @@ template< typename Element, typename Device, typename Index >
 bool MultiArray< 1, Element, Device, Index > :: operator == ( const MultiArrayT& array ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to compare two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -161,7 +161,7 @@ MultiArray< 1, Element, Device, Index >&
    MultiArray< 1, Element, Device, Index > :: operator = ( const MultiArray< 1, Element, Device, Index >& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -175,7 +175,7 @@ MultiArray< 1, Element, Device, Index >&
    MultiArray< 1, Element, Device, Index > :: operator = ( const MultiArrayT& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiArray2D_impl.h b/src/TNL/Containers/MultiArray2D_impl.h
index 83dca39ffb41b69887716c12020d0b5b20663753..cf67c102217186283c8d546211036d75251b9d8e 100644
--- a/src/TNL/Containers/MultiArray2D_impl.h
+++ b/src/TNL/Containers/MultiArray2D_impl.h
@@ -60,7 +60,7 @@ template< typename Element, typename Device, typename Index >
 bool MultiArray< 2, Element, Device, Index > :: setDimensions( const Index jSize,
                                                                   const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize );
 
@@ -72,7 +72,7 @@ bool MultiArray< 2, Element, Device, Index > :: setDimensions( const Index jSize
 template< typename Element, typename Device, typename Index >
 bool MultiArray< 2, Element, Device, Index > :: setDimensions( const Containers::StaticVector< 2, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0,
               std::cerr << "dimensions = " << dimensions );
    /****
     * Swap the dimensions in the tuple to be compatible with the previous method.
@@ -115,7 +115,7 @@ template< typename Element, typename Device, typename Index >
 __cuda_callable__
 Index MultiArray< 2, Element, Device, Index > :: getElementIndex( const Index j, const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] && j >= 0 && j < this->dimensions[ 1 ],
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] && j >= 0 && j < this->dimensions[ 1 ],
               std::cerr << "i = " << i << " j = " << j << " this->dimensions[ 0 ] = " <<  this->dimensions[ 0 ]
                    << " this->dimensions[ 1 ] = " << this->dimensions[ 1 ] );
    return j * this->dimensions[ 0 ] + i;
@@ -152,7 +152,7 @@ template< typename Element, typename Device, typename Index >
 bool MultiArray< 2, Element, Device, Index > :: operator == ( const MultiArrayT& array ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to compare two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -171,7 +171,7 @@ MultiArray< 2, Element, Device, Index >&
    MultiArray< 2, Element, Device, Index > :: operator = ( const MultiArray< 2, Element, Device, Index >& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -185,7 +185,7 @@ MultiArray< 2, Element, Device, Index >&
    MultiArray< 2, Element, Device, Index > :: operator = ( const MultiArrayT& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiArray3D_impl.h b/src/TNL/Containers/MultiArray3D_impl.h
index fe7dd87412a8ed493b12b3265836f38da0c4a2d1..633f67a798df8f4e657049e5e236a107da548a3a 100644
--- a/src/TNL/Containers/MultiArray3D_impl.h
+++ b/src/TNL/Containers/MultiArray3D_impl.h
@@ -61,7 +61,7 @@ bool MultiArray< 3, Element, Device, Index > :: setDimensions( const Index kSize
                                                                        const Index jSize,
                                                                        const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0 && kSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0 && kSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize
                    << "kSize = " << kSize );
@@ -75,7 +75,7 @@ bool MultiArray< 3, Element, Device, Index > :: setDimensions( const Index kSize
 template< typename Element, typename Device, typename Index >
 bool MultiArray< 3, Element, Device, Index > :: setDimensions( const Containers::StaticVector< 3, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ],
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ],
               std::cerr << "dimensions = " << dimensions );
    /****
     * Swap the dimensions in the tuple to be compatible with the previous method.
@@ -126,7 +126,7 @@ Index MultiArray< 3, Element, Device, Index > :: getElementIndex( const Index k,
                                                                      const Index j,
                                                                      const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] &&
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] &&
               j >= 0 && j < this->dimensions[ 1 ] &&
               k >= 0 && k < this->dimensions[ 2 ],
               std::cerr << " i = " << i
@@ -176,7 +176,7 @@ template< typename Element, typename Device, typename Index >
 bool MultiArray< 3, Element, Device, Index > :: operator == ( const MultiArrayT& array ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to compare two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -195,7 +195,7 @@ MultiArray< 3, Element, Device, Index >&
    MultiArray< 3, Element, Device, Index > :: operator = ( const MultiArray< 3, Element, Device, Index >& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -209,7 +209,7 @@ MultiArray< 3, Element, Device, Index >&
    MultiArray< 3, Element, Device, Index > :: operator = ( const MultiArrayT& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiArray4D_impl.h b/src/TNL/Containers/MultiArray4D_impl.h
index 7bcf10917dcb53ac5e047f6de0fd75bc872769ee..cb513418a339d9ce812b3f7f849390f0705a43cc 100644
--- a/src/TNL/Containers/MultiArray4D_impl.h
+++ b/src/TNL/Containers/MultiArray4D_impl.h
@@ -63,7 +63,7 @@ bool MultiArray< 4, Element, Device, Index > :: setDimensions( const Index lSize
                                                                        const Index jSize,
                                                                        const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0 && kSize > 0 && lSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0 && kSize > 0 && lSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize
                    << "kSize = " << kSize
@@ -79,7 +79,7 @@ bool MultiArray< 4, Element, Device, Index > :: setDimensions( const Index lSize
 template< typename Element, typename Device, typename Index >
 bool MultiArray< 4, Element, Device, Index > :: setDimensions( const Containers::StaticVector< 4, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ] && dimensions[ 3 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ] && dimensions[ 3 ] > 0,
               std::cerr << "dimensions = " << dimensions );
    /****
     * Swap the dimensions in the tuple to be compatible with the previous method.
@@ -135,7 +135,7 @@ Index MultiArray< 4, Element, Device, Index > :: getElementIndex( const Index l,
                                                                      const Index j,
                                                                      const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] &&
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] &&
               j >= 0 && j < this->dimensions[ 1 ] &&
               k >= 0 && k < this->dimensions[ 2 ] &&
               l >= 0 && l < this->dimensions[ 3 ],
@@ -191,7 +191,7 @@ template< typename Element, typename Device, typename Index >
 bool MultiArray< 4, Element, Device, Index > :: operator == ( const MultiArrayT& array ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to compare two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -210,7 +210,7 @@ MultiArray< 4, Element, Device, Index >&
    MultiArray< 4, Element, Device, Index > :: operator = ( const MultiArray< 4, Element, Device, Index >& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
@@ -224,7 +224,7 @@ MultiArray< 4, Element, Device, Index >&
    MultiArray< 4, Element, Device, Index > :: operator = ( const MultiArrayT& array )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == array. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == array. getDimensions(),
               std::cerr << "You are attempting to assign two arrays with different dimensions." << std::endl
                    << "First array dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second array dimensions are ( " << array. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiVector1D_impl.h b/src/TNL/Containers/MultiVector1D_impl.h
index 3fc35fee6cde0af4599dfe609b33d2bb589a7529..2bbaf6bea36c66b6bebe2d20a02ea4ba1b78492e 100644
--- a/src/TNL/Containers/MultiVector1D_impl.h
+++ b/src/TNL/Containers/MultiVector1D_impl.h
@@ -59,7 +59,7 @@ String MultiVector< 1, Real, Device, Index > :: getSerializationTypeVirtual() co
 template< typename Real, typename Device, typename Index >
 bool MultiVector< 1, Real, Device, Index > :: setDimensions( const Index iSize )
 {
-   Assert( iSize > 0,
+   TNL_ASSERT( iSize > 0,
               std::cerr << "iSize = " << iSize );
    dimensions[ 0 ] = iSize;
    return Vector< Real, Device, Index > :: setSize( iSize );
@@ -68,7 +68,7 @@ bool MultiVector< 1, Real, Device, Index > :: setDimensions( const Index iSize )
 template< typename Real, typename Device, typename Index >
 bool MultiVector< 1, Real, Device, Index > :: setDimensions( const StaticVector< Dimensions, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0,
               std::cerr << " dimensions[ 0 ] = " << dimensions[ 0 ] );
    this->dimensions = dimensions;
    return Vector< Real, Device, Index > :: setSize( this->dimensions[ 0 ] );
@@ -96,7 +96,7 @@ const StaticVector< 1, Index >& MultiVector< 1, Real, Device, Index > :: getDime
 template< typename Real, typename Device, typename Index >
 Index MultiVector< 1, Real, Device, Index > :: getElementIndex( const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ],
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ],
               std::cerr << "i = " << i
                    << "this->dimensions[ 0 ] " << this->dimensions[ 0 ] );
    return i;
@@ -132,7 +132,7 @@ template< typename Real, typename Device, typename Index >
 bool MultiVector< 1, Real, Device, Index > :: operator == ( const MultiVectorT& vector ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to compare two Vectors with different dimensions." << std::endl
                    << "First Vector name dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second Vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -151,7 +151,7 @@ MultiVector< 1, Real, Device, Index >&
    MultiVector< 1, Real, Device, Index > :: operator = ( const MultiVector< 1, Real, Device, Index >& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -165,7 +165,7 @@ MultiVector< 1, Real, Device, Index >&
    MultiVector< 1, Real, Device, Index > :: operator = ( const MultiVectorT& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiVector2D_impl.h b/src/TNL/Containers/MultiVector2D_impl.h
index dd95fdb28ae65eccc65dc70852bc059889255ed6..510ef51b576ef9b5058f9ccad7407f3c3746c480 100644
--- a/src/TNL/Containers/MultiVector2D_impl.h
+++ b/src/TNL/Containers/MultiVector2D_impl.h
@@ -60,7 +60,7 @@ template< typename Real, typename Device, typename Index >
 bool MultiVector< 2, Real, Device, Index > :: setDimensions( const Index jSize,
                                                                        const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize );
 
@@ -72,7 +72,7 @@ bool MultiVector< 2, Real, Device, Index > :: setDimensions( const Index jSize,
 template< typename Real, typename Device, typename Index >
 bool MultiVector< 2, Real, Device, Index > :: setDimensions( const StaticVector< 2, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0,
               std::cerr << "dimensions = " << dimensions );
    this->dimensions = dimensions;
    return Vector< Real, Device, Index > :: setSize( this->dimensions[ 1 ] * this->dimensions[ 0 ] );
@@ -101,7 +101,7 @@ const StaticVector< 2, Index >& MultiVector< 2, Real, Device, Index > :: getDime
 template< typename Real, typename Device, typename Index >
 Index MultiVector< 2, Real, Device, Index > :: getElementIndex( const Index j, const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] && j >= 0 && j < this->dimensions[ 1 ],
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] && j >= 0 && j < this->dimensions[ 1 ],
               std::cerr << "i = " << i
                    << "j = " << j
                    << "this->dimensions[ 0 ] = " << this->dimensions[ 0 ]
@@ -139,7 +139,7 @@ template< typename Real, typename Device, typename Index >
 bool MultiVector< 2, Real, Device, Index > :: operator == ( const MultiVectorT& vector ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to compare two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -158,7 +158,7 @@ MultiVector< 2, Real, Device, Index >&
    MultiVector< 2, Real, Device, Index > :: operator = ( const MultiVector< 2, Real, Device, Index >& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -172,7 +172,7 @@ MultiVector< 2, Real, Device, Index >&
    MultiVector< 2, Real, Device, Index > :: operator = ( const MultiVectorT& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiVector3D_impl.h b/src/TNL/Containers/MultiVector3D_impl.h
index 7d551b923b766ffe27e105c0e58dc707ea832419..3fa6b6dd30fca392a6adc5372fa76b5df37aa535 100644
--- a/src/TNL/Containers/MultiVector3D_impl.h
+++ b/src/TNL/Containers/MultiVector3D_impl.h
@@ -61,7 +61,7 @@ bool MultiVector< 3, Real, Device, Index > :: setDimensions( const Index kSize,
                                                                        const Index jSize,
                                                                        const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0 && kSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0 && kSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize
                    << "kSize = " << kSize );
@@ -75,7 +75,7 @@ bool MultiVector< 3, Real, Device, Index > :: setDimensions( const Index kSize,
 template< typename Real, typename Device, typename Index >
 bool MultiVector< 3, Real, Device, Index > :: setDimensions( const StaticVector< 3, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ],
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ],
               std::cerr << "dimensions = " << dimensions );
    this->dimensions = dimensions;
    return Vector< Real, Device, Index > :: setSize( this->dimensions[ 2 ] *
@@ -111,7 +111,7 @@ Index MultiVector< 3, Real, Device, Index > :: getElementIndex( const Index k,
                                                                      const Index j,
                                                                      const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] &&
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] &&
               j >= 0 && j < this->dimensions[ 1 ] &&
               k >= 0 && k < this->dimensions[ 2 ],
               std::cerr << " i = " << i
@@ -159,7 +159,7 @@ template< typename Real, typename Device, typename Index >
 bool MultiVector< 3, Real, Device, Index > :: operator == ( const MultiVectorT& vector ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to compare two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -178,7 +178,7 @@ MultiVector< 3, Real, Device, Index >&
    MultiVector< 3, Real, Device, Index > :: operator = ( const MultiVector< 3, Real, Device, Index >& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -192,7 +192,7 @@ MultiVector< 3, Real, Device, Index >&
    MultiVector< 3, Real, Device, Index > :: operator = ( const MultiVectorT& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/MultiVector4D_impl.h b/src/TNL/Containers/MultiVector4D_impl.h
index a4ad6262b5e296c6239327c5ea055835aa55a71d..ed69bd59d5075d918317ccad28f23e393f7efd18 100644
--- a/src/TNL/Containers/MultiVector4D_impl.h
+++ b/src/TNL/Containers/MultiVector4D_impl.h
@@ -62,7 +62,7 @@ bool MultiVector< 4, Real, Device, Index > :: setDimensions( const Index lSize,
                                                                        const Index jSize,
                                                                        const Index iSize )
 {
-   Assert( iSize > 0 && jSize > 0 && kSize > 0 && lSize > 0,
+   TNL_ASSERT( iSize > 0 && jSize > 0 && kSize > 0 && lSize > 0,
               std::cerr << "iSize = " << iSize
                    << "jSize = " << jSize
                    << "kSize = " << kSize
@@ -78,7 +78,7 @@ bool MultiVector< 4, Real, Device, Index > :: setDimensions( const Index lSize,
 template< typename Real, typename Device, typename Index >
 bool MultiVector< 4, Real, Device, Index > :: setDimensions( const StaticVector< 4, Index >& dimensions )
 {
-   Assert( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ] && dimensions[ 3 ] > 0,
+   TNL_ASSERT( dimensions[ 0 ] > 0 && dimensions[ 1 ] > 0 && dimensions[ 2 ] > 0 && dimensions[ 3 ] > 0,
               std::cerr << "dimensions = " << dimensions );
    this->dimensions = dimensions;
    return Vector< Real, Device, Index > :: setSize( this->dimensions[ 3 ] *
@@ -118,7 +118,7 @@ Index MultiVector< 4, Real, Device, Index > :: getElementIndex( const Index l,
                                                                           const Index j,
                                                                           const Index i ) const
 {
-   Assert( i >= 0 && i < this->dimensions[ 0 ] &&
+   TNL_ASSERT( i >= 0 && i < this->dimensions[ 0 ] &&
               j >= 0 && j < this->dimensions[ 1 ] &&
               k >= 0 && k < this->dimensions[ 2 ] &&
               l >= 0 && l < this->dimensions[ 3 ],
@@ -176,7 +176,7 @@ template< typename Real, typename Device, typename Index >
 bool MultiVector< 4, Real, Device, Index > :: operator == ( const MultiVectorT& vector ) const
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to compare two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -195,7 +195,7 @@ MultiVector< 4, Real, Device, Index >&
    MultiVector< 4, Real, Device, Index > :: operator = ( const MultiVector< 4, Real, Device, Index >& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
@@ -209,7 +209,7 @@ MultiVector< 4, Real, Device, Index >&
    MultiVector< 4, Real, Device, Index > :: operator = ( const MultiVectorT& vector )
 {
    // TODO: Static assert on dimensions
-   Assert( this->getDimensions() == vector. getDimensions(),
+   TNL_ASSERT( this->getDimensions() == vector. getDimensions(),
               std::cerr << "You are attempting to assign two Vectors with different dimensions." << std::endl
                    << "First vector dimensions are ( " << this->getDimensions() << " )" << std::endl
                    << "Second vector dimensions are ( " << vector. getDimensions() << " )" << std::endl; );
diff --git a/src/TNL/Containers/SharedArray_impl.h b/src/TNL/Containers/SharedArray_impl.h
index 60eeec9359382d5035f62cdb227bf3ab2a3fd273..558552a803baee2f454f617538d8a8b2d892d08f 100644
--- a/src/TNL/Containers/SharedArray_impl.h
+++ b/src/TNL/Containers/SharedArray_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/File.h>
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/StaticArray.h>
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Math.h>
 #include <TNL/param-types.h>
 
@@ -104,10 +104,10 @@ __cuda_callable__
 void SharedArray< Element, Device, Index > :: bind( Element* data,
                                                        const Index size )
 {
-   Assert( size >= 0,
+   TNL_ASSERT( size >= 0,
               std::cerr << "You try to set size of SharedArray to negative value."
                         << "New size: " << size << std::endl );
-   Assert( data != 0,
+   TNL_ASSERT( data != 0,
               std::cerr << "You try to use null pointer to data for SharedArray." );
 
    this->size = size;
@@ -123,7 +123,7 @@ void SharedArray< Element, Device, Index > :: bind( Array& array,
                                                        IndexType index,
                                                        IndexType size )
 {
-   //tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
+   //tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
    //                 "Attempt to bind arrays between different devices." );
    // TODO: fix this - it does nto work with StaticArray
    this->data = &( array. getData()[ index ] );
@@ -187,11 +187,11 @@ template< typename Element,
           typename Index >
 void SharedArray< Element, Device, Index > :: setElement( const Index& i, const Element& x )
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for setElement method in SharedArray "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
-   return ArrayOperations< Device >::setMemoryElement( & ( this->data[ i ] ), x );
+   return Algorithms::ArrayOperations< Device >::setMemoryElement( & ( this->data[ i ] ), x );
 };
 
 template< typename Element,
@@ -199,11 +199,11 @@ template< typename Element,
           typename Index >
 Element SharedArray< Element, Device, Index > :: getElement( const Index& i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for getElement method in SharedArray "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
-   return ArrayOperations< Device >::getMemoryElement( &( this->data[ i ] ) );
+   return Algorithms::ArrayOperations< Device >::getMemoryElement( &( this->data[ i ] ) );
 };
 
 template< typename Element,
@@ -212,7 +212,7 @@ template< typename Element,
 __cuda_callable__
 Element& SharedArray< Element, Device, Index > :: operator[] ( const Index& i )
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for operator[] in SharedArray "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
@@ -225,7 +225,7 @@ template< typename Element,
 __cuda_callable__
 const Element& SharedArray< Element, Device, Index > :: operator[] ( const Index& i ) const
 {
-   Assert( 0 <= i && i < this->getSize(),
+   TNL_ASSERT( 0 <= i && i < this->getSize(),
               std::cerr << "Wrong index for operator[] in SharedArray "
                         << " index is " << i
                         << " and array size is " << this->getSize() );
@@ -238,16 +238,16 @@ template< typename Element,
 SharedArray< Element, Device, Index >&
     SharedArray< Element, Device, Index > :: operator = ( const SharedArray< Element, Device, Index >& array )
 {
-   Assert( array. getSize() == this->getSize(),
+   TNL_ASSERT( array. getSize() == this->getSize(),
               std::cerr << "Source size: " << array. getSize() << std::endl
                         << "Target size: " << this->getSize() << std::endl );
-   ArrayOperations< Device > ::
-   template copyMemory< Element,
-                        Element,
-                        Index >
-                       ( this->getData(),
-                         array. getData(),
-                         array. getSize() );
+   Algorithms::ArrayOperations< Device > ::
+      template copyMemory< Element,
+                           Element,
+                           Index >
+                          ( this->getData(),
+                            array. getData(),
+                            array. getSize() );
    return ( *this );
 };
 
@@ -257,17 +257,16 @@ template< typename Element,
    template< typename Array >
 SharedArray< Element, Device, Index >& SharedArray< Element, Device, Index > :: operator = ( const Array& array )
 {
-   Assert( array. getSize() == this->getSize(),
+   TNL_ASSERT( array. getSize() == this->getSize(),
               std::cerr << "Source size: " << array. getSize() << std::endl
                         << "Target size: " << this->getSize() << std::endl );
-   ArrayOperations< typename Array :: DeviceType,
-                       Device > ::
-    template copyMemory< Element,
-                         typename Array :: ElementType,
-                         typename Array :: IndexType >
-                       ( this->getData(),
-                         array. getData(),
-                         array. getSize() );
+   Algorithms::ArrayOperations< typename Array::DeviceType, Device >::
+      template copyMemory< Element,
+                           typename Array :: ElementType,
+                           typename Array :: IndexType >
+                         ( this->getData(),
+                           array. getData(),
+                           array. getSize() );
    return ( *this );
 };
 
@@ -279,14 +278,13 @@ bool SharedArray< Element, Device, Index > :: operator == ( const Array& array )
 {
    if( array. getSize() != this->getSize() )
       return false;
-   return ArrayOperations< Device,
-                              typename Array :: DeviceType > ::
-    template compareMemory< typename Array :: ElementType,
-                            Element,
-                            typename Array :: IndexType >
-                          ( this->getData(),
-                            array. getData(),
-                            array. getSize() );
+   return Algorithms::ArrayOperations< Device, typename Array::DeviceType >::
+      template compareMemory< typename Array :: ElementType,
+                              Element,
+                              typename Array :: IndexType >
+                            ( this->getData(),
+                              array. getData(),
+                              array. getSize() );
 }
 
 template< typename Element,
@@ -303,8 +301,8 @@ template< typename Element,
           typename Index >
 void SharedArray< Element, Device, Index > :: setValue( const Element& e )
 {
-   Assert( this->size != 0, );
-   ArrayOperations< Device >::template setMemory< Element, Index >
+   TNL_ASSERT( this->size != 0, );
+   Algorithms::ArrayOperations< Device >::template setMemory< Element, Index >
                               ( this->getData(), e, this->getSize() );
 
 }
@@ -312,6 +310,7 @@ void SharedArray< Element, Device, Index > :: setValue( const Element& e )
 template< typename Element,
           typename Device,
           typename Index >
+__cuda_callable__
 const Element* SharedArray< Element, Device, Index > :: getData() const
 {
    return this->data;
@@ -320,6 +319,7 @@ const Element* SharedArray< Element, Device, Index > :: getData() const
 template< typename Element,
           typename Device,
           typename Index >
+__cuda_callable__
 Element* SharedArray< Element, Device, Index > :: getData()
 {
    return this->data;
@@ -348,7 +348,7 @@ template< typename Element,
           typename Index >
 bool SharedArray< Element, Device, Index > :: save( File& file ) const
 {
-   Assert( this->size != 0,
+   TNL_ASSERT( this->size != 0,
               std::cerr << "You try to save empty array." << std::endl );
    if( ! Object :: save( file ) )
       return false;
diff --git a/src/TNL/Containers/SharedVector_impl.h b/src/TNL/Containers/SharedVector_impl.h
index d086bfc6426a2d6e6ceffda40c73beeb67ad7613..869f0aeae455b9b3891d9147f8d3ea4285503a02 100644
--- a/src/TNL/Containers/SharedVector_impl.h
+++ b/src/TNL/Containers/SharedVector_impl.h
@@ -10,7 +10,8 @@
 
 #pragma once
 
-#include <TNL/Containers/VectorOperations.h>
+#include <TNL/Containers/SharedVector.h>
+#include <TNL/Containers/Algorithms/VectorOperations.h>
 
 namespace TNL {
 namespace Containers {   
@@ -92,7 +93,7 @@ template< typename Real,
 void SharedVector< Real, Device, Index >::addElement( const IndexType i,
                                                          const RealType& value )
 {
-   VectorOperations< Device >::addElement( *this, i, value );
+   Algorithms::VectorOperations< Device >::addElement( *this, i, value );
 }
 
 template< typename Real,
@@ -102,7 +103,7 @@ void SharedVector< Real, Device, Index >::addElement( const IndexType i,
                                                          const RealType& value,
                                                          const RealType& thisElementMultiplicator )
 {
-   VectorOperations< Device >::addElement( *this, i, value, thisElementMultiplicator );
+   Algorithms::VectorOperations< Device >::addElement( *this, i, value, thisElementMultiplicator );
 }
 
 template< typename Real,
@@ -169,7 +170,7 @@ template< typename Real,
           typename Index >
 SharedVector< Real, Device, Index >& SharedVector< Real, Device, Index > :: operator *= ( const RealType& c )
 {
-   VectorOperations< Device >::vectorScalarMultiplication( *this, c );
+   Algorithms::VectorOperations< Device >::vectorScalarMultiplication( *this, c );
    return *this;
 }
 
@@ -178,7 +179,7 @@ template< typename Real,
           typename Index >
 SharedVector< Real, Device, Index >& SharedVector< Real, Device, Index > :: operator /= ( const RealType& c )
 {
-   VectorOperations< Device >::vectorScalarMultiplication( *this, 1.0/ c );
+   Algorithms::VectorOperations< Device >::vectorScalarMultiplication( *this, 1.0/ c );
    return *this;
 }
 
@@ -187,7 +188,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: max() const
 {
-   return VectorOperations< Device > :: getVectorMax( *this );
+   return Algorithms::VectorOperations< Device > :: getVectorMax( *this );
 }
 
 template< typename Real,
@@ -195,7 +196,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: min() const
 {
-   return VectorOperations< Device > :: getVectorMin( *this );
+   return Algorithms::VectorOperations< Device > :: getVectorMin( *this );
 }
 
 
@@ -204,7 +205,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: absMax() const
 {
-   return VectorOperations< Device > :: getVectorAbsMax( *this );
+   return Algorithms::VectorOperations< Device > :: getVectorAbsMax( *this );
 }
 
 template< typename Real,
@@ -212,7 +213,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: absMin() const
 {
-   return VectorOperations< Device > :: getVectorAbsMin( *this );
+   return Algorithms::VectorOperations< Device > :: getVectorAbsMin( *this );
 }
 
 template< typename Real,
@@ -220,7 +221,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: lpNorm( const Real& p ) const
 {
-   return VectorOperations< Device > :: getVectorLpNorm( *this, p );
+   return Algorithms::VectorOperations< Device > :: getVectorLpNorm( *this, p );
 }
 
 
@@ -229,7 +230,7 @@ template< typename Real,
           typename Index >
 Real SharedVector< Real, Device, Index > :: sum() const
 {
-   return VectorOperations< Device > :: getVectorSum( *this );
+   return Algorithms::VectorOperations< Device > :: getVectorSum( *this );
 }
 
 
@@ -239,7 +240,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceMax( const Vector& v ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceMax( *this, v );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceMax( *this, v );
 }
 
 
@@ -249,7 +250,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceMin( const Vector& v ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceMin( *this, v );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceMin( *this, v );
 }
 
 
@@ -259,7 +260,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceAbsMax( const Vector& v ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceAbsMax( *this, v );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceAbsMax( *this, v );
 }
 
 template< typename Real,
@@ -268,7 +269,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceAbsMin( const Vector& v ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceAbsMin( *this, v );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceAbsMin( *this, v );
 }
 
 template< typename Real,
@@ -277,7 +278,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceLpNorm( const Vector& v, const Real& p ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceLpNorm( *this, v, p );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceLpNorm( *this, v, p );
 }
 
 
@@ -287,7 +288,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: differenceSum( const Vector& v ) const
 {
-   return VectorOperations< Device > :: getVectorDifferenceSum( *this, v );
+   return Algorithms::VectorOperations< Device > :: getVectorDifferenceSum( *this, v );
 }
 
 
@@ -296,7 +297,7 @@ template< typename Real,
           typename Index >
 void SharedVector< Real, Device, Index > :: scalarMultiplication( const Real& alpha )
 {
-   VectorOperations< Device > :: vectorScalarMultiplication( *this, alpha );
+   Algorithms::VectorOperations< Device > :: vectorScalarMultiplication( *this, alpha );
 }
 
 
@@ -306,7 +307,7 @@ template< typename Real,
 template< typename Vector >
 Real SharedVector< Real, Device, Index > :: scalarProduct( const Vector& v )
 {
-   return VectorOperations< Device > :: getScalarProduct( *this, v );
+   return Algorithms::VectorOperations< Device > :: getScalarProduct( *this, v );
 }
 
 template< typename Real,
@@ -317,7 +318,7 @@ void SharedVector< Real, Device, Index > :: addVector( const Vector& x,
                                                           const Real& alpha,
                                                           const Real& thisMultiplicator )
 {
-   VectorOperations< Device > :: addVector( *this, x, alpha, thisMultiplicator );
+   Algorithms::VectorOperations< Device > :: addVector( *this, x, alpha, thisMultiplicator );
 }
 
 template< typename Real,
@@ -332,7 +333,7 @@ addVectors( const Vector& v1,
             const Real& multiplicator2,
             const Real& thisMultiplicator )
 {
-   VectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
+   Algorithms::VectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
 template< typename Real,
@@ -340,7 +341,7 @@ template< typename Real,
           typename Index >
 void SharedVector< Real, Device, Index > :: computePrefixSum()
 {
-   VectorOperations< Device >::computePrefixSum( *this, 0, this->getSize() );
+   Algorithms::VectorOperations< Device >::computePrefixSum( *this, 0, this->getSize() );
 }
 
 template< typename Real,
@@ -349,7 +350,7 @@ template< typename Real,
 void SharedVector< Real, Device, Index > :: computePrefixSum( const IndexType begin,
                                                                  const IndexType end )
 {
-   VectorOperations< Device >::computePrefixSum( *this, begin, end );
+   Algorithms::VectorOperations< Device >::computePrefixSum( *this, begin, end );
 }
 
 template< typename Real,
@@ -357,7 +358,7 @@ template< typename Real,
           typename Index >
 void SharedVector< Real, Device, Index > :: computeExclusivePrefixSum()
 {
-   VectorOperations< Device >::computeExclusivePrefixSum( *this, 0, this->getSize() );
+   Algorithms::VectorOperations< Device >::computeExclusivePrefixSum( *this, 0, this->getSize() );
 }
 
 template< typename Real,
@@ -366,7 +367,7 @@ template< typename Real,
 void SharedVector< Real, Device, Index > :: computeExclusivePrefixSum( const IndexType begin,
                                                                           const IndexType end )
 {
-   VectorOperations< Device >::computeExclusivePrefixSum( *this, begin, end );
+   Algorithms::VectorOperations< Device >::computeExclusivePrefixSum( *this, begin, end );
 }
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
diff --git a/src/TNL/Containers/StaticArray1D_impl.h b/src/TNL/Containers/StaticArray1D_impl.h
index 9165686cf1298c81538822994d38b7d25d3b55d4..0b368dbf58c188611fa32173cfb49b6210f98f35 100644
--- a/src/TNL/Containers/StaticArray1D_impl.h
+++ b/src/TNL/Containers/StaticArray1D_impl.h
@@ -77,7 +77,7 @@ template< typename Element >
 __cuda_callable__
 inline const Element& StaticArray< 1, Element >::operator[]( int i ) const
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
@@ -86,7 +86,7 @@ template< typename Element >
 __cuda_callable__
 inline Element& StaticArray< 1, Element >::operator[]( int i )
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
diff --git a/src/TNL/Containers/StaticArray2D_impl.h b/src/TNL/Containers/StaticArray2D_impl.h
index fd9e53b5cfca4ebde411996f1d80eea4a83d1873..a0301a4bbf13f19abe1f78577de2c45cc2ece4c7 100644
--- a/src/TNL/Containers/StaticArray2D_impl.h
+++ b/src/TNL/Containers/StaticArray2D_impl.h
@@ -89,7 +89,7 @@ template< typename Element >
 __cuda_callable__
 inline const Element& StaticArray< 2, Element >::operator[]( int i ) const
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
@@ -98,7 +98,7 @@ template< typename Element >
 __cuda_callable__
 inline Element& StaticArray< 2, Element >::operator[]( int i )
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
diff --git a/src/TNL/Containers/StaticArray3D_impl.h b/src/TNL/Containers/StaticArray3D_impl.h
index d11b05bbcc8699ab6d9aff1beab9f51e2bb278c7..a246b2e3e6e2e1a26bca5c7c922a0b0371fb3dc1 100644
--- a/src/TNL/Containers/StaticArray3D_impl.h
+++ b/src/TNL/Containers/StaticArray3D_impl.h
@@ -68,6 +68,7 @@ String StaticArray< 3, Element >::getType()
 }
 
 template< typename Element >
+__cuda_callable__
 inline int StaticArray< 3, Element >::getSize() const
 {
    return size;
@@ -91,7 +92,7 @@ template< typename Element >
 __cuda_callable__
 inline const Element& StaticArray< 3, Element >::operator[]( int i ) const
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
@@ -100,7 +101,7 @@ template< typename Element >
 __cuda_callable__
 inline Element& StaticArray< 3, Element >::operator[]( int i )
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
diff --git a/src/TNL/Containers/StaticArray_impl.h b/src/TNL/Containers/StaticArray_impl.h
index 5e573623602592833e42e7654112f958749fee80..d651749e0f8f46012f7b968e0f8c5e6f0d1fc620 100644
--- a/src/TNL/Containers/StaticArray_impl.h
+++ b/src/TNL/Containers/StaticArray_impl.h
@@ -81,7 +81,7 @@ template< int Size, typename Element >
 __cuda_callable__
 inline const Element& StaticArray< Size, Element >::operator[]( int i ) const
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
@@ -90,7 +90,7 @@ template< int Size, typename Element >
 __cuda_callable__
 inline Element& StaticArray< Size, Element >::operator[]( int i )
 {
-   Assert( i >= 0 && i < size,
+   TNL_ASSERT( i >= 0 && i < size,
             std::cerr << "i = " << i << " size = " << size << std::endl; );
    return data[ i ];
 }
diff --git a/src/TNL/Containers/StaticVector.h b/src/TNL/Containers/StaticVector.h
index d3a10c9865bed431f05c6d30990b27c9fa14e018..2b0e7c7fdeb013e47d578a4ff33053a05aecde2e 100644
--- a/src/TNL/Containers/StaticVector.h
+++ b/src/TNL/Containers/StaticVector.h
@@ -306,9 +306,9 @@ class StaticVector< 3, Real > : public Containers::StaticArray< 3, Real >
    ThisType abs() const;
 };
 
-template< int Size, typename Real >
+template< int Size, typename Real, typename Scalar >
 __cuda_callable__
-StaticVector< Size, Real > operator * ( const Real& c, const StaticVector< Size, Real >& u );
+StaticVector< Size, Real > operator * ( const Scalar& c, const StaticVector< Size, Real >& u );
 
 template< int Size, typename Real >
 __cuda_callable__
@@ -377,10 +377,10 @@ Real tnlTriangleArea( const StaticVector< 3, Real >& a,
    StaticVector< 3, Real > u1, u2;
    u1. x() = b. x() - a. x();
    u1. y() = b. y() - a. y();
-   u1. z() = 0.0;
+   u1. z() = b. z() - a. z();
    u2. x() = c. x() - a. x();
    u2. y() = c. y() - a. y();
-   u2. z() = 0;
+   u2. z() = c. z() - a. z();
 
    const StaticVector< 3, Real > v = VectorProduct( u1, u2 );
    return 0.5 * ::sqrt( tnlScalarProduct( v, v ) );
diff --git a/src/TNL/Containers/StaticVector_impl.h b/src/TNL/Containers/StaticVector_impl.h
index 251c1e7369e6c20e141ef3fa2fb675d217be4654..32b8f4b95ece7bdc9c6e0a435c072e17e58e8979 100644
--- a/src/TNL/Containers/StaticVector_impl.h
+++ b/src/TNL/Containers/StaticVector_impl.h
@@ -181,8 +181,9 @@ StaticVector< Size, Real >::abs() const
 }
 
 
-template< int Size, typename Real >
-StaticVector< Size, Real > operator * ( const Real& c, const StaticVector< Size, Real >& u )
+template< int Size, typename Real, typename Scalar >
+__cuda_callable__
+StaticVector< Size, Real > operator * ( const Scalar& c, const StaticVector< Size, Real >& u )
 {
    return u * c;
 }
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 0a73dd6988a0ecf687f0a2d6194d329b0cf7b7b9..01a6a3682f5d7779ec227732e0d682c5ef454109 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -11,22 +11,13 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Functions/Domain.h>
 
 namespace TNL {
-
-namespace Devices
-{ 
-   class Host;
-}
-
 namespace Containers {   
 
-
-
 template< typename Real = double,
-           typename Device = Devices::Host,
-           typename Index = int >
+          typename Device = Devices::Host,
+          typename Index = int >
 class Vector : public Containers::Array< Real, Device, Index >
 {
    public:
diff --git a/src/TNL/Containers/Vector_impl.h b/src/TNL/Containers/Vector_impl.h
index b602d003204f3de8417c323f90bfe50ac71b06fc..b5a233faa6be107c2f7f776b7a66f58c998eaeae 100644
--- a/src/TNL/Containers/Vector_impl.h
+++ b/src/TNL/Containers/Vector_impl.h
@@ -10,7 +10,8 @@
 
 #pragma once 
 
-#include <TNL/Containers/VectorOperations.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Algorithms/VectorOperations.h>
 
 namespace TNL {
 namespace Containers {   
@@ -18,15 +19,16 @@ namespace Containers {
 template< typename Real,
           typename Device,
           typename Index >
-Vector< Real, Device, Index >::Vector()
+Vector< Real, Device, Index >::
+Vector()
 {
-
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-Vector< Real, Device, Index >::Vector( const Index size )
+Vector< Real, Device, Index >::
+Vector( const Index size )
 {
    this->setSize( size );
 }
@@ -35,18 +37,22 @@ Vector< Real, Device, Index >::Vector( const Index size )
 template< typename Real,
           typename Device,
           typename Index >
-String Vector< Real, Device, Index >::getType()
+String
+Vector< Real, Device, Index >::
+getType()
 {
    return String( "Containers::Vector< " ) +
-                    TNL::getType< Real >() + ", " +
-                     Device::getDeviceType() + ", " +
-                    TNL::getType< Index >() + " >";
+                  TNL::getType< Real >() + ", " +
+                  Device::getDeviceType() + ", " +
+                  TNL::getType< Index >() + " >";
 };
 
 template< typename Real,
           typename Device,
           typename Index >
-String Vector< Real, Device, Index >::getTypeVirtual() const
+String
+Vector< Real, Device, Index >::
+getTypeVirtual() const
 {
    return this->getType();
 };
@@ -54,7 +60,9 @@ String Vector< Real, Device, Index >::getTypeVirtual() const
 template< typename Real,
           typename Device,
           typename Index >
-String Vector< Real, Device, Index >::getSerializationType()
+String
+Vector< Real, Device, Index >::
+getSerializationType()
 {
    return HostType::getType();
 };
@@ -62,7 +70,9 @@ String Vector< Real, Device, Index >::getSerializationType()
 template< typename Real,
           typename Device,
           typename Index >
-String Vector< Real, Device, Index >::getSerializationTypeVirtual() const
+String
+Vector< Real, Device, Index >::
+getSerializationTypeVirtual() const
 {
    return this->getSerializationType();
 };
@@ -70,27 +80,32 @@ String Vector< Real, Device, Index >::getSerializationTypeVirtual() const
 template< typename Real,
           typename Device,
           typename Index >
-void Vector< Real, Device, Index >::addElement( const IndexType i,
-                                                   const RealType& value )
+void
+Vector< Real, Device, Index >::
+addElement( const IndexType i,
+            const RealType& value )
 {
-   VectorOperations< Device >::addElement( *this, i, value );
+   Algorithms::VectorOperations< Device >::addElement( *this, i, value );
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-void Vector< Real, Device, Index >::addElement( const IndexType i,
-                                                   const RealType& value,
-                                                   const RealType& thisElementMultiplicator )
+void
+Vector< Real, Device, Index >::
+addElement( const IndexType i,
+            const RealType& value,
+            const RealType& thisElementMultiplicator )
 {
-   VectorOperations< Device >::addElement( *this, i, value, thisElementMultiplicator );
+   Algorithms::VectorOperations< Device >::addElement( *this, i, value, thisElementMultiplicator );
 }
 
 template< typename Real,
-           typename Device,
-           typename Index >
+          typename Device,
+          typename Index >
 Vector< Real, Device, Index >&
-   Vector< Real, Device, Index >::operator = ( const Vector< Real, Device, Index >& vector )
+Vector< Real, Device, Index >::
+operator = ( const Vector< Real, Device, Index >& vector )
 {
    Containers::Array< Real, Device, Index >::operator = ( vector );
    return ( *this );
@@ -101,7 +116,8 @@ template< typename Real,
            typename Index >
    template< typename VectorT >
 Vector< Real, Device, Index >&
-   Vector< Real, Device, Index >::operator = ( const VectorT& vector )
+Vector< Real, Device, Index >::
+operator = ( const VectorT& vector )
 {
    Containers::Array< Real, Device, Index >::operator = ( vector );
    return ( *this );
@@ -111,7 +127,9 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename VectorT >
-bool Vector< Real, Device, Index >::operator == ( const VectorT& vector ) const
+bool
+Vector< Real, Device, Index >::
+operator == ( const VectorT& vector ) const
 {
    return Containers::Array< Real, Device, Index >::operator == ( vector );
 }
@@ -120,7 +138,9 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename VectorT >
-bool Vector< Real, Device, Index >::operator != ( const VectorT& vector ) const
+bool
+Vector< Real, Device, Index >::
+operator != ( const VectorT& vector ) const
 {
    return Containers::Array< Real, Device, Index >::operator != ( vector );
 }
@@ -129,7 +149,9 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename VectorT >
-Vector< Real, Device, Index >& Vector< Real, Device, Index >::operator -= ( const VectorT& vector )
+Vector< Real, Device, Index >&
+Vector< Real, Device, Index >::
+operator -= ( const VectorT& vector )
 {
    this->addVector( vector, -1.0 );
    return *this;
@@ -139,7 +161,9 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename VectorT >
-Vector< Real, Device, Index >& Vector< Real, Device, Index >::operator += ( const VectorT& vector )
+Vector< Real, Device, Index >&
+Vector< Real, Device, Index >::
+operator += ( const VectorT& vector )
 {
    this->addVector( vector );
    return *this;
@@ -148,18 +172,22 @@ Vector< Real, Device, Index >& Vector< Real, Device, Index >::operator += ( cons
 template< typename Real,
           typename Device,
           typename Index >
-Vector< Real, Device, Index >& Vector< Real, Device, Index >::operator *= ( const RealType& c )
+Vector< Real, Device, Index >&
+Vector< Real, Device, Index >::
+operator *= ( const RealType& c )
 {
-   VectorOperations< Device >::vectorScalarMultiplication( *this, c );
+   Algorithms::VectorOperations< Device >::vectorScalarMultiplication( *this, c );
    return *this;
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-Vector< Real, Device, Index >& Vector< Real, Device, Index >::operator /= ( const RealType& c )
+Vector< Real, Device, Index >&
+Vector< Real, Device, Index >::
+operator /= ( const RealType& c )
 {
-   VectorOperations< Device >::vectorScalarMultiplication( *this, 1.0 / c );
+   Algorithms::VectorOperations< Device >::vectorScalarMultiplication( *this, 1.0 / c );
    return *this;
 }
 
@@ -169,7 +197,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::max() const
 {
-   return VectorOperations< Device >::getVectorMax( *this );
+   return Algorithms::VectorOperations< Device >::getVectorMax( *this );
 }
 
 template< typename Real,
@@ -177,7 +205,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::min() const
 {
-   return VectorOperations< Device >::getVectorMin( *this );
+   return Algorithms::VectorOperations< Device >::getVectorMin( *this );
 }
 
 
@@ -186,7 +214,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::absMax() const
 {
-   return VectorOperations< Device >::getVectorAbsMax( *this );
+   return Algorithms::VectorOperations< Device >::getVectorAbsMax( *this );
 }
 
 template< typename Real,
@@ -194,7 +222,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::absMin() const
 {
-   return VectorOperations< Device >::getVectorAbsMin( *this );
+   return Algorithms::VectorOperations< Device >::getVectorAbsMin( *this );
 }
 
 template< typename Real,
@@ -202,7 +230,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::lpNorm( const Real& p ) const
 {
-   return VectorOperations< Device >::getVectorLpNorm( *this, p );
+   return Algorithms::VectorOperations< Device >::getVectorLpNorm( *this, p );
 }
 
 
@@ -211,7 +239,7 @@ template< typename Real,
           typename Index >
 Real Vector< Real, Device, Index >::sum() const
 {
-   return VectorOperations< Device >::getVectorSum( *this );
+   return Algorithms::VectorOperations< Device >::getVectorSum( *this );
 }
 
 
@@ -221,7 +249,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceMax( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceMax( *this, v );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceMax( *this, v );
 }
 
 
@@ -231,7 +259,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceMin( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceMin( *this, v );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceMin( *this, v );
 }
 
 
@@ -241,7 +269,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceAbsMax( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceAbsMax( *this, v );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceAbsMax( *this, v );
 }
 
 template< typename Real,
@@ -250,7 +278,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceAbsMin( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceAbsMin( *this, v );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceAbsMin( *this, v );
 }
 
 template< typename Real,
@@ -259,7 +287,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceLpNorm( const VectorT& v, const Real& p ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceLpNorm( *this, v, p );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceLpNorm( *this, v, p );
 }
 
 
@@ -269,7 +297,7 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::differenceSum( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getVectorDifferenceSum( *this, v );
+   return Algorithms::VectorOperations< Device >::getVectorDifferenceSum( *this, v );
 }
 
 
@@ -278,7 +306,7 @@ template< typename Real,
           typename Index >
 void Vector< Real, Device, Index >::scalarMultiplication( const Real& alpha )
 {
-   VectorOperations< Device >::vectorScalarMultiplication( *this, alpha );
+   Algorithms::VectorOperations< Device >::vectorScalarMultiplication( *this, alpha );
 }
 
 
@@ -288,18 +316,20 @@ template< typename Real,
 template< typename VectorT >
 Real Vector< Real, Device, Index >::scalarProduct( const VectorT& v ) const
 {
-   return VectorOperations< Device >::getScalarProduct( *this, v );
+   return Algorithms::VectorOperations< Device >::getScalarProduct( *this, v );
 }
 
 template< typename Real,
           typename Device,
           typename Index >
 template< typename VectorT >
-void Vector< Real, Device, Index >::addVector( const VectorT& x,
-                                                    const Real& multiplicator,
-                                                    const Real& thisMultiplicator )
+void
+Vector< Real, Device, Index >::
+addVector( const VectorT& x,
+           const Real& multiplicator,
+           const Real& thisMultiplicator )
 {
-   VectorOperations< Device >::addVector( *this, x, multiplicator, thisMultiplicator );
+   Algorithms::VectorOperations< Device >::addVector( *this, x, multiplicator, thisMultiplicator );
 }
 
 template< typename Real,
@@ -314,7 +344,7 @@ addVectors( const VectorT& v1,
             const Real& multiplicator2,
             const Real& thisMultiplicator )
 {
-   VectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
+   Algorithms::VectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
 template< typename Real,
@@ -322,16 +352,18 @@ template< typename Real,
           typename Index >
 void Vector< Real, Device, Index >::computePrefixSum()
 {
-   VectorOperations< Device >::computePrefixSum( *this, 0, this->getSize() );
+   Algorithms::VectorOperations< Device >::computePrefixSum( *this, 0, this->getSize() );
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-void Vector< Real, Device, Index >::computePrefixSum( const IndexType begin,
-                                                           const IndexType end )
+void
+Vector< Real, Device, Index >::
+computePrefixSum( const IndexType begin,
+                  const IndexType end )
 {
-   VectorOperations< Device >::computePrefixSum( *this, begin, end );
+   Algorithms::VectorOperations< Device >::computePrefixSum( *this, begin, end );
 }
 
 template< typename Real,
@@ -339,16 +371,18 @@ template< typename Real,
           typename Index >
 void Vector< Real, Device, Index >::computeExclusivePrefixSum()
 {
-   VectorOperations< Device >::computeExclusivePrefixSum( *this, 0, this->getSize() );
+   Algorithms::VectorOperations< Device >::computeExclusivePrefixSum( *this, 0, this->getSize() );
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-void Vector< Real, Device, Index >::computeExclusivePrefixSum( const IndexType begin,
-                                                                    const IndexType end )
+void
+Vector< Real, Device, Index >::
+computeExclusivePrefixSum( const IndexType begin,
+                           const IndexType end )
 {
-   VectorOperations< Device >::computeExclusivePrefixSum( *this, begin, end );
+   Algorithms::VectorOperations< Device >::computeExclusivePrefixSum( *this, begin, end );
 }
 
 
diff --git a/src/TNL/CudaStreamPool.h b/src/TNL/CudaStreamPool.h
index 780d95e4de983c363dfc042f424d57e895700a18..aa0a3e5d1e2ddee0d4416040ae9815f5242955f6 100644
--- a/src/TNL/CudaStreamPool.h
+++ b/src/TNL/CudaStreamPool.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          CudaStreamPool.h  -  description
+                             -------------------
+    begin                : Oct 14, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <stdlib.h>
diff --git a/src/TNL/Curve.h b/src/TNL/Curve.h
index ace8d622be4afa36cbde901b3d8847a358294f1f..6704b8a98865aabf36a4b5b8ce971c32c1802ff6 100644
--- a/src/TNL/Curve.h
+++ b/src/TNL/Curve.h
@@ -13,7 +13,7 @@
 #include <iomanip>
 #include <fstream>
 #include <cstring>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/Object.h>
 #include <TNL/Math.h>
 #include <TNL/Containers/StaticVector.h>
@@ -22,7 +22,8 @@
 namespace TNL {
 
 //! Basic structure for curves
-template< class T > class CurveElement
+template< class T >
+class CurveElement
 {
    public:
    CurveElement() {};
@@ -71,7 +72,10 @@ template< class T > class CurveElement
    bool separator;
 };
 
-template< class T > class Curve : public Object, public List< CurveElement< T > >
+template< class T >
+class Curve
+ : public Object,
+   public Containers::List< CurveElement< T > >
 {
    public:
    //! Basic contructor
@@ -94,20 +98,20 @@ template< class T > class Curve : public Object, public List< CurveElement< T >
    //! Append new point
    void Append( const T& vec, bool separator = false )
    {
-      List< CurveElement< T > > :: Append( CurveElement< T >( vec, separator ) );
+      Containers::List< CurveElement< T > > :: Append( CurveElement< T >( vec, separator ) );
    };
 
    //! Erase the curve
    void Erase()
    {
-      List< CurveElement< T > >::reset();
+      Containers::List< CurveElement< T > >::reset();
    };
  
    //! Method for saving the object to a file as a binary data
    bool save( File& file ) const
    {
       if( ! Object :: save( file ) ) return false;
-      if( ! List< CurveElement< T > > :: DeepSave( file ) ) return false;
+      if( ! Containers::List< CurveElement< T > > :: DeepSave( file ) ) return false;
       return true;
    };
 
@@ -115,7 +119,7 @@ template< class T > class Curve : public Object, public List< CurveElement< T >
    bool load( File& file )
    {
       if( ! Object :: load( file ) ) return false;
-      if( ! List< CurveElement< T > > :: DeepLoad( file ) ) return false;
+      if( ! Containers::List< CurveElement< T > > :: DeepLoad( file ) ) return false;
       return true;
    };
 
diff --git a/src/TNL/Debugging/CMakeLists.txt b/src/TNL/Debugging/CMakeLists.txt
new file mode 100755
index 0000000000000000000000000000000000000000..d689cdaa12eef26c1feb3a061223a2d32271b58f
--- /dev/null
+++ b/src/TNL/Debugging/CMakeLists.txt
@@ -0,0 +1,7 @@
+SET( headers
+         FPE.h
+         MemoryUsage.h
+         StackBacktrace.h
+)
+   
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/TNL/Debugging )
diff --git a/src/TNL/Debugging/FPE.h b/src/TNL/Debugging/FPE.h
new file mode 100644
index 0000000000000000000000000000000000000000..09011e8953205089db602357ad9840965dc95ede
--- /dev/null
+++ b/src/TNL/Debugging/FPE.h
@@ -0,0 +1,69 @@
+/***************************************************************************
+                          FPE.h  -  description
+                             -------------------
+    begin                : Nov 6, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <cfenv>
+#include <signal.h>
+
+#include <TNL/Debugging/StackBacktrace.h>
+
+namespace TNL {
+namespace Debugging {
+
+static void
+printStackBacktraceAndAbort( int sig = 0 )
+{
+   if( sig == SIGSEGV )
+      fprintf(stderr, "Invalid memory reference, printing backtrace and aborting...\n");
+   else if( sig == SIGFPE ) {
+      /*
+       * Unfortunately it is not possible to get the floating-point exception type
+       * from a signal handler. Otherwise, it would be done this way:
+       *
+       *    fprintf(stderr, "Floating-point exception");
+       *    if(fetestexcept(FE_DIVBYZERO))  fprintf(stderr, " FE_DIVBYZERO");
+       *    if(fetestexcept(FE_INEXACT))    fprintf(stderr, " FE_INEXACT");
+       *    if(fetestexcept(FE_INVALID))    fprintf(stderr, " FE_INVALID");
+       *    if(fetestexcept(FE_OVERFLOW))   fprintf(stderr, " FE_OVERFLOW");
+       *    if(fetestexcept(FE_UNDERFLOW))  fprintf(stderr, " FE_UNDERFLOW");
+       *    fprintf(stderr, " occurred, printing backtrace and aborting...\n");
+       */
+      fprintf(stderr, "Floating-point exception occurred, printing backtrace and aborting...\n");
+   }
+   else
+      fprintf( stderr, "Aborting due to signal %d...\n", sig );
+   printStackBacktrace();
+   abort();
+}
+
+/*
+ * Registers handler for SIGSEGV and SIGFPE signals and enables conversion of
+ * floating-point exceptions into SIGFPE. This is useful e.g. for tracing where
+ * NANs occurred. Example usage:
+ *
+ * int main()
+ * {
+ *    #ifndef NDEBUG
+ *       trackFloatingPointExceptions();
+ *    #endif
+ *    [start some computation here...]
+ * }
+ */
+static void
+trackFloatingPointExceptions()
+{
+   signal( SIGSEGV, printStackBacktraceAndAbort );
+   signal( SIGFPE,  printStackBacktraceAndAbort );
+   feenableexcept( FE_ALL_EXCEPT & ~FE_INEXACT );
+}
+
+} // namespace Debugging
+} // namespace TNL
diff --git a/src/TNL/Debugging/MemoryUsage.h b/src/TNL/Debugging/MemoryUsage.h
new file mode 100644
index 0000000000000000000000000000000000000000..03ea3fe4d3db40f566ce82762521da1596e45dd5
--- /dev/null
+++ b/src/TNL/Debugging/MemoryUsage.h
@@ -0,0 +1,70 @@
+/***************************************************************************
+                          MemoryUsage.h  -  description
+                             -------------------
+    begin                : Nov 6, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <limits>
+
+namespace TNL {
+namespace Debugging {
+
+/*
+ * Prints memory usage of the current process into the specified stream.
+ *
+ * The information is obtained from /proc/self/status, which is assumed to be
+ * present on the system. The meaning of the printed values is following:
+ *  
+ *  - VmSize: Virtual memory size.
+ *  - VmRSS: Resident set size.
+ *  - VmHWM: Peak resident set size ("high water mark").
+ *
+ * See the proc(5) manual on Linux for details.
+ */
+static void
+printMemoryUsage( std::ostream& str = std::cerr )
+{
+   std::ifstream meminfo("/proc/self/status");
+   if( meminfo.fail() ) {
+      std::cerr << "error: unable to open /proc/self/status" << std::endl;
+      return;
+   }
+
+   unsigned vm = 0;
+   unsigned rss = 0;
+   unsigned hwm = 0;
+
+   std::string desc;
+   while( meminfo.good() ) {
+       // extract description (first column)
+       meminfo >> desc;
+
+       if( desc == "VmSize:" )
+           meminfo >> vm;
+       if( desc == "VmHWM:" )
+           meminfo >> hwm;
+       if( desc == "VmRSS:" )
+           meminfo >> rss;
+
+       // ignore the rest of irrelevant lines
+       meminfo.ignore( std::numeric_limits< std::streamsize >::max(), '\n' );
+   }
+
+   str << "Memory usage (MiB): "
+       << "VmSize = " << vm / 1024 << "MiB, "
+       << "VmRSS = " << rss / 1024 << "MiB, "
+       << "VmHWM = " << hwm / 1024 << "MiB, "
+       << std::endl;
+}
+
+} // namespace Debugging
+} // namespace TNL
diff --git a/src/TNL/Debugging/StackBacktrace.h b/src/TNL/Debugging/StackBacktrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..17e576fd7219e6da8b4b7e3bfc5beab4510d42cc
--- /dev/null
+++ b/src/TNL/Debugging/StackBacktrace.h
@@ -0,0 +1,104 @@
+/***************************************************************************
+                          StackBacktrace.h  -  description
+                             -------------------
+    begin                : Nov 6, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+
+namespace TNL {
+namespace Debugging {
+
+/*
+ * Print a demangled stack backtrace of the caller function to FILE* out.
+ *
+ * Reference: http://panthema.net/2008/0901-stacktrace-demangled/
+ *
+ * Note that the program must be linked with the -rdynamic flag, otherwise
+ * demangling will not work.
+ */
+static void
+printStackBacktrace( FILE *out = stderr, unsigned int max_frames = 63 )
+{
+   fprintf(out, "stack trace:\n");
+
+   // storage array for stack trace address data
+   void* addrlist[max_frames+1];
+
+   // retrieve current stack addresses
+   int addrlen = backtrace(addrlist, sizeof(addrlist) / sizeof(void*));
+
+   if (addrlen == 0) {
+      fprintf(out, "  <empty, possibly corrupt>\n");
+      return;
+   }
+
+   // resolve addresses into strings containing "filename(function+address)",
+   // this array must be free()-ed
+   char** symbollist = backtrace_symbols(addrlist, addrlen);
+
+   // allocate string which will be filled with the demangled function name
+   size_t funcnamesize = 256;
+   char* funcname = (char*)malloc(funcnamesize);
+
+   // iterate over the returned symbol lines. skip the first, it is the
+   // address of this function.
+   for (int i = 1; i < addrlen; i++) {
+      char *begin_name = 0, *begin_offset = 0, *end_offset = 0;
+
+      // find parentheses and +address offset surrounding the mangled name:
+      // ./module(function+0x15c) [0x8048a6d]
+      for (char *p = symbollist[i]; *p; ++p) {
+         if (*p == '(')
+            begin_name = p;
+         else if (*p == '+')
+            begin_offset = p;
+         else if (*p == ')' && begin_offset) {
+            end_offset = p;
+            break;
+         }
+      }
+
+      if (begin_name && begin_offset && end_offset && begin_name < begin_offset) {
+         *begin_name++ = '\0';
+         *begin_offset++ = '\0';
+         *end_offset = '\0';
+
+         // mangled name is now in [begin_name, begin_offset) and caller
+         // offset in [begin_offset, end_offset). now apply
+         // __cxa_demangle():
+
+         int status;
+         char* ret = abi::__cxa_demangle(begin_name, funcname, &funcnamesize, &status);
+         if (status == 0) {
+            funcname = ret; // use possibly realloc()-ed string
+            fprintf(out, "  %d %s : %s+%s\n",
+               i, symbollist[i], funcname, begin_offset);
+         }
+         else {
+            // demangling failed. Output function name as a C function with no arguments.
+            fprintf(out, "  %d %s : %s()+%s\n",
+               i, symbollist[i], begin_name, begin_offset);
+         }
+      }
+      else {
+         // couldn't parse the line? print the whole line.
+         fprintf(out, "  %d %s\n", i, symbollist[i]);
+      }
+   }
+
+   free(funcname);
+   free(symbollist);
+}
+
+} // namespace Debugging
+} // namespace TNL
diff --git a/src/TNL/DevicePointer.h b/src/TNL/DevicePointer.h
index beaed6e42ec47eac9626a53a0f365ae8e6d2f25d..3a8bd43ff2a03e390446156952fc6571be362454 100644
--- a/src/TNL/DevicePointer.h
+++ b/src/TNL/DevicePointer.h
@@ -1,20 +1,15 @@
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
 /***************************************************************************
                           DevicePointer.h  -  description
                              -------------------
     begin                : Sep 1, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <TNL/Devices/Host.h>
@@ -295,9 +290,9 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
       const Object& getData() const
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pointer, );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
             return *( this->pointer );
          if( std::is_same< Device, Devices::Cuda >::value )
@@ -309,9 +304,9 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
       Object& modifyData()
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pointer, );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
          {
             this->pd->maybe_modified = true;
@@ -380,8 +375,8 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
 #ifdef HAVE_CUDA
          if( this->modified() )
          {
-            Assert( this->pointer, );
-            Assert( this->cuda_pointer, );
+            TNL_ASSERT( this->pointer, );
+            TNL_ASSERT( this->cuda_pointer, );
             cudaMemcpy( (void*) this->cuda_pointer, (void*) this->pointer, sizeof( ObjectType ), cudaMemcpyHostToDevice );
             if( ! checkCudaDevice ) {
                return false;
@@ -428,16 +423,16 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
 
       void set_last_sync_state()
       {
-         Assert( this->pointer, );
-         Assert( this->pd, );
+         TNL_ASSERT( this->pointer, );
+         TNL_ASSERT( this->pd, );
          std::memcpy( (void*) &this->pd->data_image, (void*) this->pointer, sizeof( Object ) );
          this->pd->maybe_modified = false;
       }
 
       bool modified()
       {
-         Assert( this->pointer, );
-         Assert( this->pd, );
+         TNL_ASSERT( this->pointer, );
+         TNL_ASSERT( this->pd, );
          // optimization: skip bitwise comparison if we're sure that the data is the same
          if( ! this->pd->maybe_modified )
             return false;
diff --git a/src/TNL/Devices/Cuda.cpp b/src/TNL/Devices/Cuda.cpp
index ef868455784575e3304ac4ad2416e6b856f38ce5..2c8f85aeca4c1fc27a76c14115c7e29cda8c4251 100644
--- a/src/TNL/Devices/Cuda.cpp
+++ b/src/TNL/Devices/Cuda.cpp
@@ -19,17 +19,13 @@ namespace TNL {
 namespace Devices {
 
 SmartPointersRegister Cuda::smartPointersRegister;
+Timer Cuda::smartPointersSynchronizationTimer;
 
 String Cuda::getDeviceType()
 {
    return String( "Cuda" );
 }
 
-int Cuda::getGPUTransferBufferSize()
-{
-   return 1 << 20;
-}
-
 int Cuda::getNumberOfBlocks( const int threads,
                              const int blockSize )
 {
@@ -42,14 +38,10 @@ int Cuda::getNumberOfGrids( const int blocks,
    return roundUpDivision( blocks, gridSize );
 }
 
-/*size_t Cuda::getFreeMemory()
-{
-
-}*/
-
 void Cuda::configSetup( Config::ConfigDescription& config,
                         const String& prefix )
 {
+// FIXME: HAVE_CUDA is never defined in .cpp files
 #ifdef HAVE_CUDA
    config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation.", 0 );
 #else
@@ -60,13 +52,16 @@ void Cuda::configSetup( Config::ConfigDescription& config,
 bool Cuda::setup( const Config::ParameterContainer& parameters,
                   const String& prefix )
 {
+// FIXME: HAVE_CUDA is never defined in .cpp files
 #ifdef HAVE_CUDA
-   int cudaDevice = parameters.getParameter< int >( "cuda-device" );
+   int cudaDevice = parameters.getParameter< int >( prefix + "cuda-device" );
    if( cudaSetDevice( cudaDevice ) != cudaSuccess )
    {
       std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
       return false;
    }
+   smartPointersSynchronizationTimer.reset();
+   smartPointersSynchronizationTimer.stop();
 #endif
    return true;
 }
@@ -85,7 +80,10 @@ bool Cuda::synchronizeDevice( int deviceId )
 {
    if( deviceId < 0 )
       deviceId = Devices::CudaDeviceInfo::getActiveDevice();
-   return smartPointersRegister.synchronizeDevice( deviceId );
+   smartPointersSynchronizationTimer.start();
+   bool b = smartPointersRegister.synchronizeDevice( deviceId );
+   smartPointersSynchronizationTimer.stop();
+   return b;
 }
 
 } // namespace Devices
diff --git a/src/TNL/Devices/Cuda.cu b/src/TNL/Devices/Cuda.cu
index 3d4ad065ddfa472ef50ca79bd3cd6fe43e49ce1b..d2ac6e6b6dc9fc3854a7c48c5b9b2eec7da79897 100644
--- a/src/TNL/Devices/Cuda.cu
+++ b/src/TNL/Devices/Cuda.cu
@@ -15,36 +15,6 @@
 namespace TNL {
 namespace Devices {
 
-/*void Cuda::configSetup( tnlConfigDescription& config, const String& prefix )
-{
-#ifdef HAVE_CUDA
-   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device.", 0 );
-#else
-   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device (CUDA is not supported on this system).", 0 );
-#endif
-}
- 
-bool Cuda::setup( const tnlParameterContainer& parameters,
-                    const String& prefix )
-{
-   int cudaDevice = parameters.getParameter< int >( prefix + "cuda-device" );
-#ifdef HAVE_CUDA
-    cudaSetDevice( cudaDevice );
-    checkCudaDevice;
-#endif
-   return true;
-}
-*/
-
-int Cuda::getDeviceId()
-{
-   int id( 0 );
-#ifdef HAVE_CUDA
-   cudaGetDevice( &id );
-#endif
-   return id;
-}
-
 bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
 {   
    if( error == cudaSuccess )
@@ -57,368 +27,371 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
       // 1
       case cudaErrorMissingConfiguration:
          std::cerr
-          << "The device function being invoked (usually via ::cudaLaunch()) was not " << std::endl
-          << "previously configured via the ::cudaConfigureCall() function. " << std::endl;
-       break;
+            << "The device function being invoked (usually via ::cudaLaunch()) was not " << std::endl
+            << "previously configured via the ::cudaConfigureCall() function. " << std::endl;
+         break;
 
       // 2
       case cudaErrorMemoryAllocation:
          std::cerr
-          << "The API call failed because it was unable to allocate enough memory to " << std::endl
-          << "perform the requested operation. " << std::endl;
-       break;
+            << "The API call failed because it was unable to allocate enough memory to " << std::endl
+            << "perform the requested operation. " << std::endl;
+         break;
 
       // 3
       case cudaErrorInitializationError:
          std::cerr
-          << "The API call failed because the CUDA driver and runtime could not be " << std::endl
-          << "initialized. " << std::endl;
-       break;
+            << "The API call failed because the CUDA driver and runtime could not be " << std::endl
+            << "initialized. " << std::endl;
+         break;
  
       // 4
       case cudaErrorLaunchFailure:
          std::cerr
-          << "An exception occurred on the device while executing a kernel. Common " << std::endl
-          << "causes include dereferencing an invalid device pointer and accessing " << std::endl
-          << "out of bounds shared memory. The device cannot be used until " << std::endl
-          << "::cudaThreadExit() is called. All existing device memory allocations " << std::endl
-          << "are invalid and must be reconstructed if the program is to continue " << std::endl
-          << "using CUDA. " << std::endl;
-       break;
+            << "An exception occurred on the device while executing a kernel. Common " << std::endl
+            << "causes include dereferencing an invalid device pointer and accessing " << std::endl
+            << "out of bounds shared memory. The device cannot be used until " << std::endl
+            << "::cudaThreadExit() is called. All existing device memory allocations " << std::endl
+            << "are invalid and must be reconstructed if the program is to continue " << std::endl
+            << "using CUDA. " << std::endl;
+         break;
 
       // 5
       case cudaErrorPriorLaunchFailure:
          std::cerr
-          << "This indicated that a previous kernel launch failed. This was previously " << std::endl
-          << "used for device emulation of kernel launches. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "This indicated that a previous kernel launch failed. This was previously " << std::endl
+            << "used for device emulation of kernel launches. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       // 6
       case cudaErrorLaunchTimeout:
          std::cerr
-          << "This indicates that the device kernel took too long to execute. This can " << std::endl
-          << "only occur if timeouts are enabled - see the device property " << std::endl
-          << "ref ::cudaDeviceProp::kernelExecTimeoutEnabled \"kernelExecTimeoutEnabled\" " << std::endl
-          << "for more information. The device cannot be used until ::cudaThreadExit() " << std::endl
-          << "is called. All existing device memory allocations are invalid and must be " << std::endl
-          << "reconstructed if the program is to continue using CUDA. " << std::endl;
-       break;
+            << "This indicates that the device kernel took too long to execute. This can " << std::endl
+            << "only occur if timeouts are enabled - see the device property " << std::endl
+            << "ref ::cudaDeviceProp::kernelExecTimeoutEnabled \"kernelExecTimeoutEnabled\" " << std::endl
+            << "for more information. The device cannot be used until ::cudaThreadExit() " << std::endl
+            << "is called. All existing device memory allocations are invalid and must be " << std::endl
+            << "reconstructed if the program is to continue using CUDA. " << std::endl;
+         break;
 
       // 7
       case cudaErrorLaunchOutOfResources:
          std::cerr
-          << "This indicates that a launch did not occur because it did not have " << std::endl
-          << "appropriate resources. Although this error is similar to " << std::endl
-          << "::cudaErrorInvalidConfiguration, this error usually indicates that the " << std::endl
-          << "user has attempted to pass too many arguments to the device kernel, or the " << std::endl
-          << "kernel launch specifies too many threads for the kernel's register count. " << std::endl;
-       break;
+            << "This indicates that a launch did not occur because it did not have " << std::endl
+            << "appropriate resources. Although this error is similar to " << std::endl
+            << "::cudaErrorInvalidConfiguration, this error usually indicates that the " << std::endl
+            << "user has attempted to pass too many arguments to the device kernel, or the " << std::endl
+            << "kernel launch specifies too many threads for the kernel's register count. " << std::endl;
+         break;
 
       // 8
       case cudaErrorInvalidDeviceFunction:
          std::cerr
-          << "The requested device function does not exist or is not compiled for the " << std::endl
-          << "proper device architecture. " << std::endl;
-       break;
+            << "The requested device function does not exist or is not compiled for the " << std::endl
+            << "proper device architecture. " << std::endl;
+         break;
  
       // 9
       case cudaErrorInvalidConfiguration:
          std::cerr
-          << "This indicates that a kernel launch is requesting resources that can " << std::endl
-          << "never be satisfied by the current device. Requesting more shared memory " << std::endl
-          << "per block than the device supports will trigger this error, as will " << std::endl
-          << "requesting too many threads or blocks. See ::cudaDeviceProp for more " << std::endl
-          << "device limitations. " << std::endl;
-       break;
+            << "This indicates that a kernel launch is requesting resources that can " << std::endl
+            << "never be satisfied by the current device. Requesting more shared memory " << std::endl
+            << "per block than the device supports will trigger this error, as will " << std::endl
+            << "requesting too many threads or blocks. See ::cudaDeviceProp for more " << std::endl
+            << "device limitations. " << std::endl;
+         break;
 
       // 10
       case cudaErrorInvalidDevice:
          std::cerr
-          << "This indicates that the device ordinal supplied by the user does not " << std::endl
-          << "correspond to a valid CUDA device. " << std::endl;
-       break;
+            << "This indicates that the device ordinal supplied by the user does not " << std::endl
+            << "correspond to a valid CUDA device. " << std::endl;
+         break;
 
       // 11
       case cudaErrorInvalidValue:
          std::cerr
-          << "This indicates that one or more of the parameters passed to the API call " << std::endl
-          << "is not within an acceptable range of values. " << std::endl;
-       break;
+            << "This indicates that one or more of the parameters passed to the API call " << std::endl
+            << "is not within an acceptable range of values. " << std::endl;
+         break;
 
       // 12
       case cudaErrorInvalidPitchValue:
          std::cerr
-          << "This indicates that one or more of the pitch-related parameters passed " << std::endl
-          << "to the API call is not within the acceptable range for pitch. " << std::endl;
-       break;
+            << "This indicates that one or more of the pitch-related parameters passed " << std::endl
+            << "to the API call is not within the acceptable range for pitch. " << std::endl;
+         break;
 
       // 13
       case cudaErrorInvalidSymbol:
          std::cerr
-          << "This indicates that the symbol name/identifier passed to the API call " << std::endl
-          << "is not a valid name or identifier. " << std::endl;
-       break;
+            << "This indicates that the symbol name/identifier passed to the API call " << std::endl
+            << "is not a valid name or identifier. " << std::endl;
+         break;
 
       // 14
       case cudaErrorMapBufferObjectFailed:
-      std::cerr
-       << "This indicates that the buffer object could not be mapped. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that the buffer object could not be mapped. " << std::endl;
+         break;
 
       // 15
       case cudaErrorUnmapBufferObjectFailed:
          std::cerr
-          << "This indicates that the buffer object could not be unmapped. " << std::endl;
-       break;
+            << "This indicates that the buffer object could not be unmapped. " << std::endl;
+         break;
 
       // 16
       case cudaErrorInvalidHostPointer:
          std::cerr
-          << "This indicates that at least one host pointer passed to the API call is " << std::endl
-          << "not a valid host pointer. " << std::endl;
-       break;
+            << "This indicates that at least one host pointer passed to the API call is " << std::endl
+            << "not a valid host pointer. " << std::endl;
+         break;
 
       // 17
       case cudaErrorInvalidDevicePointer:
          std::cerr
-          << "This indicates that at least one device pointer passed to the API call is " << std::endl
-          << "not a valid device pointer. " << std::endl;
-       break;
+            << "This indicates that at least one device pointer passed to the API call is " << std::endl
+            << "not a valid device pointer. " << std::endl;
+         break;
 
       case cudaErrorInvalidTexture:
          std::cerr
-          << "This indicates that the texture passed to the API call is not a valid " << std::endl
-          << "texture. " << std::endl;
-       break;
+            << "This indicates that the texture passed to the API call is not a valid " << std::endl
+            << "texture. " << std::endl;
+         break;
 
       case cudaErrorInvalidTextureBinding:
          std::cerr
-          << "This indicates that the texture binding is not valid. This occurs if you " << std::endl
-          << "call ::cudaGetTextureAlignmentOffset() with an unbound texture. " << std::endl;
-       break;
+            << "This indicates that the texture binding is not valid. This occurs if you " << std::endl
+            << "call ::cudaGetTextureAlignmentOffset() with an unbound texture. " << std::endl;
+         break;
 
       case cudaErrorInvalidChannelDescriptor:
          std::cerr
-          << "This indicates that the channel descriptor passed to the API call is not " << std::endl
-          << "valid. This occurs if the format is not one of the formats specified by " << std::endl
-          << "::cudaChannelFormatKind, or if one of the dimensions is invalid. " << std::endl;
-       break;
+            << "This indicates that the channel descriptor passed to the API call is not " << std::endl
+            << "valid. This occurs if the format is not one of the formats specified by " << std::endl
+            << "::cudaChannelFormatKind, or if one of the dimensions is invalid. " << std::endl;
+         break;
 
       case cudaErrorInvalidMemcpyDirection:
          std::cerr
-          << "This indicates that the direction of the memcpy passed to the API call is " << std::endl
-          << "not one of the types specified by ::cudaMemcpyKind. " << std::endl;
-       break;
+            << "This indicates that the direction of the memcpy passed to the API call is " << std::endl
+            << "not one of the types specified by ::cudaMemcpyKind. " << std::endl;
+         break;
 
       case cudaErrorAddressOfConstant:
          std::cerr
-          << "This indicated that the user has taken the address of a constant variable, " << std::endl
-          << "which was forbidden up until the CUDA 3.1 release. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Variables in constant " << std::endl
-          << "memory may now have their address taken by the runtime via " << std::endl
-          << "::cudaGetSymbolAddress(). " << std::endl;
-       break;
+            << "This indicated that the user has taken the address of a constant variable, " << std::endl
+            << "which was forbidden up until the CUDA 3.1 release. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Variables in constant " << std::endl
+            << "memory may now have their address taken by the runtime via " << std::endl
+            << "::cudaGetSymbolAddress(). " << std::endl;
+         break;
 
       case cudaErrorTextureFetchFailed:
          std::cerr
-          << "This indicated that a texture fetch was not able to be performed. " << std::endl
-          << "This was previously used for device emulation of texture operations. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "This indicated that a texture fetch was not able to be performed. " << std::endl
+            << "This was previously used for device emulation of texture operations. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       case cudaErrorTextureNotBound:
          std::cerr
-          << "This indicated that a texture was not bound for access. " << std::endl
-          << "This was previously used for device emulation of texture operations. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "This indicated that a texture was not bound for access. " << std::endl
+            << "This was previously used for device emulation of texture operations. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       case cudaErrorSynchronizationError:
          std::cerr
-          << "This indicated that a synchronization operation had failed. " << std::endl
-          << "This was previously used for some device emulation functions. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "This indicated that a synchronization operation had failed. " << std::endl
+            << "This was previously used for some device emulation functions. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       case cudaErrorInvalidFilterSetting:
          std::cerr
-          << "This indicates that a non-float texture was being accessed with linear " << std::endl
-          << "filtering. This is not supported by CUDA. " << std::endl;
-       break;
+            << "This indicates that a non-float texture was being accessed with linear " << std::endl
+            << "filtering. This is not supported by CUDA. " << std::endl;
+         break;
 
       case cudaErrorInvalidNormSetting:
          std::cerr
-          << "This indicates that an attempt was made to read a non-float texture as a " << std::endl
-          << "normalized float. This is not supported by CUDA. " << std::endl;
-       break;
+            << "This indicates that an attempt was made to read a non-float texture as a " << std::endl
+            << "normalized float. This is not supported by CUDA. " << std::endl;
+         break;
 
       case cudaErrorMixedDeviceExecution:
          std::cerr
-          << "Mixing of device and device emulation code was not allowed. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "Mixing of device and device emulation code was not allowed. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       case cudaErrorCudartUnloading:
          std::cerr
-          << "This indicated an issue with calling API functions during the unload " << std::endl
-          << "process of the CUDA runtime in prior releases. " << std::endl
-          << "This error return is deprecated as of CUDA 3.2. " << std::endl;
-       break;
+            << "This indicated an issue with calling API functions during the unload " << std::endl
+            << "process of the CUDA runtime in prior releases. " << std::endl
+            << "This error return is deprecated as of CUDA 3.2. " << std::endl;
+         break;
 
       case cudaErrorUnknown:
          std::cerr
-          << "This indicates that an unknown internal error has occurred. " << std::endl;
-       break;
+            << "This indicates that an unknown internal error has occurred. " << std::endl;
+         break;
 
       case cudaErrorNotYetImplemented:
          std::cerr
-          << "This indicates that the API call is not yet implemented. Production " << std::endl
-          << "releases of CUDA will never return this error. " << std::endl;
-       break;
+            << "This indicates that the API call is not yet implemented. Production " << std::endl
+            << "releases of CUDA will never return this error. " << std::endl;
+         break;
 
       case cudaErrorMemoryValueTooLarge:
          std::cerr
-          << "This indicated that an emulated device pointer exceeded the 32-bit address " << std::endl
-          << "range. " << std::endl
-          << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
-          << "removed with the CUDA 3.1 release. " << std::endl;
-       break;
+            << "This indicated that an emulated device pointer exceeded the 32-bit address " << std::endl
+            << "range. " << std::endl
+            << "This error return is deprecated as of CUDA 3.1. Device emulation mode was " << std::endl
+            << "removed with the CUDA 3.1 release. " << std::endl;
+         break;
 
       case cudaErrorInvalidResourceHandle:
          std::cerr
-          << "This indicates that a resource handle passed to the API call was not " << std::endl
-          << "valid. Resource handles are opaque types like ::cudaStream_t and " << std::endl
-          << "::cudaEvent_t. " << std::endl;
-       break;
+            << "This indicates that a resource handle passed to the API call was not " << std::endl
+            << "valid. Resource handles are opaque types like ::cudaStream_t and " << std::endl
+            << "::cudaEvent_t. " << std::endl;
+         break;
 
       case cudaErrorNotReady:
          std::cerr
-          << "This indicates that asynchronous operations issued previously have not " << std::endl
-          << "completed yet. This result is not actually an error, but must be indicated " << std::endl
-          << "differently than ::cudaSuccess (which indicates completion). Calls that " << std::endl
-          << "may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). " << std::endl;
-       break;
+            << "This indicates that asynchronous operations issued previously have not " << std::endl
+            << "completed yet. This result is not actually an error, but must be indicated " << std::endl
+            << "differently than ::cudaSuccess (which indicates completion). Calls that " << std::endl
+            << "may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). " << std::endl;
+         break;
 
       case cudaErrorInsufficientDriver:
          std::cerr
-          << "This indicates that the installed NVIDIA CUDA driver is older than the " << std::endl
-          << "CUDA runtime library. This is not a supported configuration. Users should " << std::endl
-          << "install an updated NVIDIA display driver to allow the application to run. " << std::endl;
-       break;
+            << "This indicates that the installed NVIDIA CUDA driver is older than the " << std::endl
+            << "CUDA runtime library. This is not a supported configuration. Users should " << std::endl
+            << "install an updated NVIDIA display driver to allow the application to run. " << std::endl;
+         break;
 
       case cudaErrorSetOnActiveProcess:
          std::cerr
-          << "This indicates that the user has called ::cudaSetDevice(), " << std::endl
-          << "::cudaSetValidDevices(), ::cudaSetDeviceFlags(), " << std::endl
-          << "::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice, " << std::endl
-          << "::cudaD3D11SetDirect3DDevice(), * or ::cudaVDPAUSetVDPAUDevice() after " << std::endl
-          << "initializing the CUDA runtime by calling non-device management operations " << std::endl
-          << "(allocating memory and launching kernels are examples of non-device " << std::endl
-          << "management operations). This error can also be returned if using " << std::endl
-          << "runtime/driver interoperability and there is an existing ::CUcontext " << std::endl
-          << "active on the host thread. " << std::endl;
-       break;
+            << "This indicates that the user has called ::cudaSetDevice(), " << std::endl
+            << "::cudaSetValidDevices(), ::cudaSetDeviceFlags(), " << std::endl
+            << "::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice, " << std::endl
+            << "::cudaD3D11SetDirect3DDevice(), * or ::cudaVDPAUSetVDPAUDevice() after " << std::endl
+            << "initializing the CUDA runtime by calling non-device management operations " << std::endl
+            << "(allocating memory and launching kernels are examples of non-device " << std::endl
+            << "management operations). This error can also be returned if using " << std::endl
+            << "runtime/driver interoperability and there is an existing ::CUcontext " << std::endl
+            << "active on the host thread. " << std::endl;
+         break;
 
       case cudaErrorInvalidSurface:
          std::cerr
-          << "This indicates that the surface passed to the API call is not a valid " << std::endl
-          << "surface. " << std::endl;
-       break;
+            << "This indicates that the surface passed to the API call is not a valid " << std::endl
+            << "surface. " << std::endl;
+         break;
 
       case cudaErrorNoDevice:
-      std::cerr
-       << "This indicates that no CUDA-capable devices were detected by the installed " << std::endl
-       << "CUDA driver. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that no CUDA-capable devices were detected by the installed " << std::endl
+            << "CUDA driver. " << std::endl;
+         break;
 
       case cudaErrorECCUncorrectable:
-      std::cerr
-       << "This indicates that an uncorrectable ECC error was detected during " << std::endl
-       << "execution. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that an uncorrectable ECC error was detected during " << std::endl
+            << "execution. " << std::endl;
+         break;
 
       case cudaErrorSharedObjectSymbolNotFound:
-      std::cerr
-       << "This indicates that a link to a shared object failed to resolve. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that a link to a shared object failed to resolve. " << std::endl;
+         break;
 
       case cudaErrorSharedObjectInitFailed:
-      std::cerr
-       << "This indicates that initialization of a shared object failed. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that initialization of a shared object failed. " << std::endl;
+         break;
 
       case cudaErrorUnsupportedLimit:
-      std::cerr
-       << "This indicates that the ::cudaLimit passed to the API call is not " << std::endl
-       << "supported by the active device. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that the ::cudaLimit passed to the API call is not " << std::endl
+            << "supported by the active device. " << std::endl;
+         break;
 
       case cudaErrorDuplicateVariableName:
-      std::cerr
-       << "This indicates that multiple global or constant variables (across separate " << std::endl
-       << "CUDA source files in the application) share the same string name. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that multiple global or constant variables (across separate " << std::endl
+            << "CUDA source files in the application) share the same string name. " << std::endl;
+         break;
 
       case cudaErrorDuplicateTextureName:
-      std::cerr
-       << "This indicates that multiple textures (across separate CUDA source " << std::endl
-       << "files in the application) share the same string name. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that multiple textures (across separate CUDA source " << std::endl
+            << "files in the application) share the same string name. " << std::endl;
+         break;
 
       case cudaErrorDuplicateSurfaceName:
-      std::cerr
-       << "This indicates that multiple surfaces (across separate CUDA source " << std::endl
-       << "files in the application) share the same string name. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that multiple surfaces (across separate CUDA source " << std::endl
+            << "files in the application) share the same string name. " << std::endl;
+         break;
 
       case cudaErrorDevicesUnavailable:
-      std::cerr
-       << "This indicates that all CUDA devices are busy or unavailable at the current " << std::endl
-       << "time. Devices are often busy/unavailable due to use of " << std::endl
-       << "::cudaComputeModeExclusive or ::cudaComputeModeProhibited. They can also " << std::endl
-       << "be unavailable due to memory constraints on a device that already has " << std::endl
-       << "active CUDA work being performed. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that all CUDA devices are busy or unavailable at the current " << std::endl
+            << "time. Devices are often busy/unavailable due to use of " << std::endl
+            << "::cudaComputeModeExclusive or ::cudaComputeModeProhibited. They can also " << std::endl
+            << "be unavailable due to memory constraints on a device that already has " << std::endl
+            << "active CUDA work being performed. " << std::endl;
+         break;
 
       case cudaErrorInvalidKernelImage:
-      std::cerr
-       << "This indicates that the device kernel image is invalid. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that the device kernel image is invalid. " << std::endl;
+         break;
 
       case cudaErrorNoKernelImageForDevice:
-      std::cerr
-       << "This indicates that there is no kernel image available that is suitable " << std::endl
-       << "for the device. This can occur when a user specifies code generation " << std::endl
-       << "options for a particular CUDA source file that do not include the " << std::endl
-       << "corresponding device configuration. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that there is no kernel image available that is suitable " << std::endl
+            << "for the device. This can occur when a user specifies code generation " << std::endl
+            << "options for a particular CUDA source file that do not include the " << std::endl
+            << "corresponding device configuration. " << std::endl;
+         break;
 
       case cudaErrorIncompatibleDriverContext:
-      std::cerr
-       << "This indicates that the current context is not compatible with this " << std::endl
-       << "version of the CUDA Runtime. This can only occur if you are using CUDA " << std::endl
-       << "Runtime/Driver interoperability and have created an existing Driver " << std::endl
-       << "context using an older API. Please see \ref CUDART_DRIVER " << std::endl
-       << "\"Interactions with the CUDA Driver API\" for more information. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates that the current context is not compatible with this " << std::endl
+            << "version of the CUDA Runtime. This can only occur if you are using CUDA " << std::endl
+            << "Runtime/Driver interoperability and have created an existing Driver " << std::endl
+            << "context using an older API. Please see \\ref CUDART_DRIVER " << std::endl
+            << "\"Interactions with the CUDA Driver API\" for more information. " << std::endl;
+         break;
 
       case cudaErrorStartupFailure:
-      std::cerr
-       << "This indicates an internal startup failure in the CUDA runtime. " << std::endl;
-       break;
+         std::cerr
+            << "This indicates an internal startup failure in the CUDA runtime. " << std::endl;
+         break;
 
       case cudaErrorApiFailureBase:
-      std::cerr
-       << "Any unhandled CUDA driver error is added to this value and returned via " << std::endl
-       << "the runtime. Production releases of CUDA should not return such errors. " << std::endl;
-       break;
+         std::cerr
+            << "Any unhandled CUDA driver error is added to this value and returned via " << std::endl
+            << "the runtime. Production releases of CUDA should not return such errors. " << std::endl;
+         break;
 
+      default:
+         std::cerr << "(detailed description is not available)" << std::endl;
+         break;
    }
    throw EXIT_FAILURE;
    return false;
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 301b0091634e412522a52539563b2a8c0e6b122c..23f2f342eac8993b6e0099ce4678cab6bf64b5cb 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -15,6 +15,7 @@
 #include <TNL/String.h>
 #include <TNL/Assert.h>
 #include <TNL/SmartPointersRegister.h>
+#include <TNL/Timer.h>
 
 namespace TNL {
 
@@ -31,29 +32,27 @@ namespace Devices {
 #define __cuda_callable__
 #endif
 
-
 class Cuda
 {
    public:
 
    static String getDeviceType();
 
-   __cuda_callable__ static inline int getMaxGridSize();
+   __cuda_callable__ static inline constexpr int getMaxGridSize();
 
-   __cuda_callable__ static inline int getMaxBlockSize();
+   __cuda_callable__ static inline constexpr int getMaxBlockSize();
 
-   __cuda_callable__ static inline int getWarpSize();
+   __cuda_callable__ static inline constexpr int getWarpSize();
 
-#ifdef HAVE_CUDA
-   static int getDeviceId();
-   
-   template< typename Index >
-   __device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 );
-#endif
+   __cuda_callable__ static inline constexpr int getNumberOfSharedMemoryBanks();
 
-   __cuda_callable__ static inline int getNumberOfSharedMemoryBanks();
+   static inline constexpr int getGPUTransferBufferSize();
 
-   static int getGPUTransferBufferSize();
+#ifdef HAVE_CUDA
+   __device__ static inline int
+   getGlobalThreadIdx( const int gridIdx = 0,
+                       const int gridSize = getMaxGridSize() );
+#endif
 
    static int getNumberOfBlocks( const int threads,
                                  const int blockSize );
@@ -61,8 +60,6 @@ class Cuda
    static int getNumberOfGrids( const int blocks,
                                 const int gridSize = getMaxGridSize() );
 
-   static size_t getFreeMemory();
-
    template< typename ObjectType >
    static ObjectType* passToDevice( const ObjectType& object );
 
@@ -82,6 +79,27 @@ class Cuda
 #ifdef HAVE_CUDA
    template< typename Index >
    static __device__ Index getInterleaving( const Index index );
+
+   /****
+    * Declaration of variables for dynamic shared memory is difficult in
+    * templated functions. For example, the following does not work for
+    * different types T:
+    *
+    *    template< typename T >
+    *    void foo()
+    *    {
+    *        extern __shared__ T shx[];
+    *    }
+    *
+    * This is because extern variables must be declared exactly once. In
+    * templated functions we need to have same variable name with different
+    * type, which causes the conflict. In CUDA samples they solve the problem
+    * using template specialization via classes, but using one base type and
+    * reinterpret_cast works too.
+    * See http://stackoverflow.com/a/19339004/4180822 for reference.
+    */
+   template< typename Element, size_t Alignment = sizeof( Element ) >
+   static __device__ Element* getSharedMemory();
 #endif
 
 #ifdef HAVE_CUDA
@@ -92,7 +110,7 @@ class Cuda
     */
    static bool checkDevice( const char* file_name, int line, cudaError error );
 #else
-   static bool checkDevice() { return false;};
+   static bool checkDevice() { return false; };
 #endif
    
    static void configSetup( Config::ConfigDescription& config, const String& prefix = "" );
@@ -108,10 +126,11 @@ class Cuda
    // called to get the device ID.
    static bool synchronizeDevice( int deviceId = -1 );
    
+   static Timer smartPointersSynchronizationTimer;
+   
    protected:
    
    static SmartPointersRegister smartPointersRegister;
-
 };
 
 #ifdef HAVE_CUDA
@@ -123,29 +142,6 @@ class Cuda
 #define CudaSupportMissingMessage \
    std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;
 
-
-// TODO: This would be nice in Cuda but C++ standard does not allow it.
-#ifdef HAVE_CUDA
-   template< typename Element >
-   struct getSharedMemory
-   {
-       __device__ operator Element*();
-   };
-
-   template<>
-   struct getSharedMemory< double >
-   {
-       inline __device__ operator double*();
-   };
-
-   template<>
-   struct getSharedMemory< long int >
-   {
-       inline __device__ operator long int*();
-   };
-
-#endif
-
 } // namespace Devices
 } // namespace TNL   
    
diff --git a/src/TNL/Devices/CudaDeviceInfo.cpp b/src/TNL/Devices/CudaDeviceInfo.cpp
index 93061f8bd5ff1817c02f35b654de78f671f8c009..45199ecda1fa986dc68d0fd5386b06c09e8b7ecc 100644
--- a/src/TNL/Devices/CudaDeviceInfo.cpp
+++ b/src/TNL/Devices/CudaDeviceInfo.cpp
@@ -11,6 +11,7 @@
 #ifndef HAVE_CUDA
 
 #include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Logger.h>
 
 namespace TNL {
 namespace Devices {   
@@ -64,6 +65,13 @@ getGlobalMemory( int deviceNum )
    return 0;
 }
 
+size_t
+CudaDeviceInfo::
+getFreeGlobalMemory()
+{
+   return 0;
+}
+
 int
 CudaDeviceInfo::
 getMemoryClockRate( int deviceNum )
@@ -99,6 +107,12 @@ getCudaCores( int deviceNum )
    return 0;
 }
 
+void
+CudaDeviceInfo::
+writeDeviceInfo( Logger& logger )
+{
+}
+
 } // namespace Devices
 } // namespace TNL
 
diff --git a/src/TNL/Devices/CudaDeviceInfo.cu b/src/TNL/Devices/CudaDeviceInfo.cu
index 590347580373e26365ed110d8a83ed76d31b154f..84096561dcb11de412828b07e43403132af493f4 100644
--- a/src/TNL/Devices/CudaDeviceInfo.cu
+++ b/src/TNL/Devices/CudaDeviceInfo.cu
@@ -10,8 +10,10 @@
 
 #ifdef HAVE_CUDA
 
+#include <unordered_map>
+
 #include <TNL/Devices/CudaDeviceInfo.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Logger.h>
 
 namespace TNL {
 namespace Devices {
@@ -79,6 +81,16 @@ getGlobalMemory( int deviceNum )
     return properties.totalGlobalMem;
 }
 
+size_t
+CudaDeviceInfo::
+getFreeGlobalMemory()
+{
+   size_t free = 0;
+   size_t total = 0;
+   cudaMemGetInfo( &free, &total );
+   return free;
+}
+
 int
 CudaDeviceInfo::
 getMemoryClockRate( int deviceNum )
@@ -101,9 +113,15 @@ int
 CudaDeviceInfo::
 getCudaMultiprocessors( int deviceNum )
 {
-    cudaDeviceProp properties;
-    cudaGetDeviceProperties( &properties, deviceNum );
-    return properties.multiProcessorCount;
+    // results are cached because they are used for configuration of some kernels
+    static std::unordered_map< int, int > results;
+    if( results.count( deviceNum ) == 0 ) {
+        cudaDeviceProp properties;
+        cudaGetDeviceProperties( &properties, deviceNum );
+        results.emplace( deviceNum, properties.multiProcessorCount );
+        return properties.multiProcessorCount;
+    }
+    return results[ deviceNum ];
 }
 
 int
@@ -141,6 +159,35 @@ getCudaCores( int deviceNum )
            CudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
 }
 
+void
+CudaDeviceInfo::
+writeDeviceInfo( Logger& logger )
+{
+   logger.writeParameter< String >( "CUDA GPU info", String("") );
+   // TODO: Printing all devices does not make sense until TNL can actually
+   //       use more than one device for computations. Printing only the active
+   //       device for now...
+//   int devices = getNumberOfDevices();
+//   writeParameter< int >( "Number of devices", devices, 1 );
+//   for( int i = 0; i < devices; i++ )
+//   {
+//      logger.writeParameter< int >( "Device no.", i, 1 );
+      int i = getActiveDevice();
+      logger.writeParameter< String >( "Name", getDeviceName( i ), 2 );
+      String deviceArch = String( getArchitectureMajor( i ) ) + "." +
+                              String( getArchitectureMinor( i ) );
+      logger.writeParameter< String >( "Architecture", deviceArch, 2 );
+      logger.writeParameter< int >( "CUDA cores", getCudaCores( i ), 2 );
+      double clockRate = ( double ) getClockRate( i ) / 1.0e3;
+      logger.writeParameter< double >( "Clock rate (in MHz)", clockRate, 2 );
+      double globalMemory = ( double ) getGlobalMemory( i ) / 1.0e9;
+      logger.writeParameter< double >( "Global memory (in GB)", globalMemory, 2 );
+      double memoryClockRate = ( double ) getMemoryClockRate( i ) / 1.0e3;
+      logger.writeParameter< double >( "Memory clock rate (in Mhz)", memoryClockRate, 2 );
+      logger.writeParameter< bool >( "ECC enabled", getECCEnabled( i ), 2 );
+//   }
+}
+
 } // namespace Devices
 } // namespace TNL
 
diff --git a/src/TNL/Devices/CudaDeviceInfo.h b/src/TNL/Devices/CudaDeviceInfo.h
index 3bc7475727faee79b016a4d881aee36cc36b8973..b658e917703f8d97a7caca76c8055a845670506d 100644
--- a/src/TNL/Devices/CudaDeviceInfo.h
+++ b/src/TNL/Devices/CudaDeviceInfo.h
@@ -11,9 +11,13 @@
 #pragma once
 
 #include <stdlib.h>
-#include <TNL/Devices/Cuda.h>
+
+#include <TNL/String.h>
 
 namespace TNL {
+
+class Logger;
+
 namespace Devices {
 
 class CudaDeviceInfo
@@ -34,6 +38,8 @@ class CudaDeviceInfo
 
       static size_t getGlobalMemory( int deviceNum );
 
+      static size_t getFreeGlobalMemory();
+
       static int getMemoryClockRate( int deviceNum );
 
       static bool getECCEnabled( int deviceNum );
@@ -44,8 +50,8 @@ class CudaDeviceInfo
 
       static int getCudaCores( int deviceNum );
 
+      static void writeDeviceInfo( Logger& logger );
 };
 
 } // namespace Devices
 } // namespace TNL
-
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 9c0d252a5a9e59458ca6cb915e4543dba6cfddbe..ad5198d7a767606d0acae453f864720838882ee5 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -16,41 +16,45 @@ namespace TNL {
 namespace Devices {   
 
 __cuda_callable__ 
-inline int Cuda::getMaxGridSize()
+inline constexpr int Cuda::getMaxGridSize()
 {
    // TODO: make it preprocessor macro constant defined in tnlConfig
    return 65535;
 };
 
 __cuda_callable__
-inline int Cuda::getMaxBlockSize()
+inline constexpr int Cuda::getMaxBlockSize()
 {
    // TODO: make it preprocessor macro constant defined in tnlConfig
    return 1024;
 };
 
 __cuda_callable__ 
-inline int Cuda::getWarpSize()
+inline constexpr int Cuda::getWarpSize()
 {
    // TODO: make it preprocessor macro constant defined in tnlConfig
    return 32;
 }
 
-#ifdef HAVE_CUDA
-template< typename Index >
-__device__ Index Cuda::getGlobalThreadIdx( const Index gridIdx )
+__cuda_callable__
+inline constexpr int Cuda::getNumberOfSharedMemoryBanks()
 {
-   return ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   // TODO: make it preprocessor macro constant defined in tnlConfig
+   return 32;
 }
-#endif
 
+inline constexpr int Cuda::getGPUTransferBufferSize()
+{
+   return 1 << 20;
+}
 
-__cuda_callable__ 
-inline int Cuda::getNumberOfSharedMemoryBanks()
+#ifdef HAVE_CUDA
+__device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gridSize )
 {
-   // TODO: make it preprocessor macro constant defined in tnlConfig
-   return 32;
+   return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
 }
+#endif
+
 
 template< typename ObjectType >
 ObjectType* Cuda::passToDevice( const ObjectType& object )
@@ -74,7 +78,7 @@ ObjectType* Cuda::passToDevice( const ObjectType& object )
    }
    return deviceObject;
 #else
-   Assert( false, std::cerr << "CUDA support is missing." );
+   TNL_ASSERT( false, std::cerr << "CUDA support is missing." );
    return 0;
 #endif
 }
@@ -91,7 +95,7 @@ ObjectType Cuda::passFromDevice( const ObjectType* object )
    checkCudaDevice;
    return aux;
 #else
-   Assert( false, std::cerr << "CUDA support is missing." );
+   TNL_ASSERT( false, std::cerr << "CUDA support is missing." );
    return 0;
 #endif
 }
@@ -107,7 +111,7 @@ void Cuda::passFromDevice( const ObjectType* deviceObject,
                cudaMemcpyDeviceToHost );
    checkCudaDevice;
 #else
-   Assert( false, std::cerr << "CUDA support is missing." );
+   TNL_ASSERT( false, std::cerr << "CUDA support is missing." );
 #endif
 }
 
@@ -129,7 +133,7 @@ void Cuda::freeFromDevice( ObjectType* deviceObject )
    cudaFree( ( void* ) deviceObject );
    checkCudaDevice;
 #else
-   Assert( false, std::cerr << "CUDA support is missing." );
+   TNL_ASSERT( false, std::cerr << "CUDA support is missing." );
 #endif
 }
 
@@ -140,25 +144,12 @@ __device__ Index Cuda::getInterleaving( const Index index )
    return index + index / Cuda::getNumberOfSharedMemoryBanks();
 }
 
-template< typename Element >
-__device__ getSharedMemory< Element >::operator Element*()
-{
-   extern __shared__ int __sharedMemory[];
-   return ( Element* ) __sharedMemory;
-};
-
-__device__ inline getSharedMemory< double >::operator double*()
+template< typename Element, size_t Alignment >
+__device__ Element* Cuda::getSharedMemory()
 {
-   extern __shared__ double __sharedMemoryDouble[];
-   return ( double* ) __sharedMemoryDouble;
-};
-
-__device__ inline getSharedMemory< long int >::operator long int*()
-{
-   extern __shared__ long int __sharedMemoryLongInt[];
-   return ( long int* ) __sharedMemoryLongInt;
-};
-
+   extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
+   return reinterpret_cast< Element* >( __sdata );
+}
 #endif /* HAVE_CUDA */
 
 } // namespace Devices
diff --git a/src/TNL/Devices/Host.cpp b/src/TNL/Devices/Host.cpp
index f46678c2d78219bceb01a1836b53d240f66c40dc..db56392bcb6ad8218e7f3a45f3d1173eae541d94 100644
--- a/src/TNL/Devices/Host.cpp
+++ b/src/TNL/Devices/Host.cpp
@@ -8,16 +8,31 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include <TNL/Devices/Host.h>
+#include <set>
+#include <iomanip>
+#include <cstring>
+#include <ctime>
+
+#include <sys/utsname.h>
+#include <sys/stat.h>
+
 #ifdef HAVE_OPENMP
 #include <omp.h>
 #endif
+
+#include <TNL/tnlConfig.h>
+#include <TNL/Devices/Host.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
+#include <TNL/Logger.h>
 
 namespace TNL {
 namespace Devices {   
 
+int Host::numberOfProcessors( 0 );
+String Host::CPUModelName( "" );
+int Host::CPUThreads( 0 );
+int Host::CPUCores( 0 );
 bool Host::ompEnabled( true );
 int Host::maxThreadsCount( -1 );
 
@@ -26,6 +41,209 @@ String Host::getDeviceType()
    return String( "Devices::Host" );
 };
 
+
+String
+Host::getHostname( void )
+{
+   char host_name[ 256 ];
+   gethostname( host_name, 255 );
+   return String( host_name );
+}
+
+String
+Host::getArchitecture( void )
+{
+   utsname uts;
+   uname( &uts );
+   return String( uts.machine );
+}
+
+String
+Host::getSystemName( void )
+{
+   utsname uts;
+   uname( &uts );
+   return String( uts.sysname );
+}
+
+String
+Host::getSystemRelease( void )
+{
+   utsname uts;
+   uname( &uts );
+   return String( uts.release );
+}
+
+String
+Host::getCurrentTime( const char* format )
+{
+   const std::time_t time_since_epoch = std::time( nullptr );
+   std::tm* localtime = std::localtime( &time_since_epoch );
+   // TODO: use std::put_time in the future (available since GCC 5)
+//   std::stringstream ss;
+//   ss << std::put_time( localtime, format );
+//   return String( ss.str().c_str() );
+   char buffer[1024];
+   std::strftime( buffer, 1024, format, localtime );
+   return String( buffer );
+}
+
+
+int
+Host::getNumberOfProcessors( void )
+{
+   if( numberOfProcessors == 0 )
+      parseCPUInfo();
+   return numberOfProcessors;
+}
+
+String
+Host::getOnlineCPUs( void )
+{
+   std::string online = readFile< std::string >( "/sys/devices/system/cpu/online" );
+   return String( online.c_str() );
+}
+
+int
+Host::getNumberOfCores( int cpu_id )
+{
+   if( CPUCores == 0 )
+      parseCPUInfo();
+   return CPUCores;
+}
+
+int
+Host::getNumberOfThreads( int cpu_id )
+{
+   if( CPUThreads == 0 )
+      parseCPUInfo();
+   return CPUThreads;
+}
+
+String
+Host::getCPUModelName( int cpu_id )
+{
+   if( CPUModelName == "" )
+      parseCPUInfo();
+   return CPUModelName;
+}
+
+int
+Host::getCPUMaxFrequency( int cpu_id )
+{
+   String fileName( "/sys/devices/system/cpu/cpu" );
+   fileName += String( cpu_id ) + "/cpufreq/cpuinfo_max_freq";
+   return readFile< int >( fileName );
+}
+
+CacheSizes
+Host::getCPUCacheSizes( int cpu_id )
+{
+   String directory( "/sys/devices/system/cpu/cpu" );
+   directory += String( cpu_id ) + "/cache";
+
+   CacheSizes sizes;
+   for( int i = 0; i <= 3; i++ ) {
+      const String cache = directory + "/index" + String( i );
+
+      // check if the directory exists
+      struct stat st;
+      if( stat( cache.getString(), &st ) != 0 || ! S_ISDIR( st.st_mode ) )
+         break;
+
+      const int level = readFile< int >( cache + "/level" );
+      const std::string type = readFile< std::string >( cache + "/type" );
+      const int size = readFile< int >( cache + "/size" );
+
+      if( level == 1 && type == "Instruction" )
+         sizes.L1instruction = size;
+      else if( level == 1 && type == "Data" )
+         sizes.L1data = size;
+      else if( level == 2 )
+         sizes.L2 = size;
+      else if( level == 3 )
+         sizes.L3 = size;
+   }
+   return sizes;
+}
+
+void
+Host::
+writeDeviceInfo( Logger& logger )
+{
+   logger.writeParameter< String >( "Host name:", getHostname() );
+   logger.writeParameter< String >( "System:", getSystemName() );
+   logger.writeParameter< String >( "Release:", getSystemRelease() );
+   logger.writeParameter< String >( "Architecture:", getArchitecture() );
+   logger.writeParameter< char* >( "TNL Compiler:", ( char* ) TNL_CPP_COMPILER_NAME );
+   // FIXME: generalize for multi-socket systems, here we consider only the first found CPU
+   const int cpu_id = 0;
+   const int threads = getNumberOfThreads( cpu_id );
+   const int cores = getNumberOfCores( cpu_id );
+   int threadsPerCore = 0;
+   if( cores > 0 )
+      threadsPerCore = threads / cores;
+   logger.writeParameter< String >( "CPU info", String("") );
+   logger.writeParameter< String >( "Model name:", getCPUModelName( cpu_id ), 1 );
+   logger.writeParameter< int >( "Cores:", cores, 1 );
+   logger.writeParameter< int >( "Threads per core:", threadsPerCore, 1 );
+   logger.writeParameter< String >( "Max clock rate (in MHz):", getCPUMaxFrequency( cpu_id ) / 1000, 1 );
+   CacheSizes cacheSizes = getCPUCacheSizes( cpu_id );
+   String cacheInfo = String( cacheSizes.L1data ) + ", "
+                       + String( cacheSizes.L1instruction ) + ", "
+                       + String( cacheSizes.L2 ) + ", "
+                       + String( cacheSizes.L3 );
+   logger.writeParameter< String >( "Cache (L1d, L1i, L2, L3):", cacheInfo, 1 );
+}
+
+void
+Host::parseCPUInfo( void )
+{
+   std::ifstream file( "/proc/cpuinfo" );
+   if( ! file ) {
+      std::cerr << "Unable to read information from /proc/cpuinfo." << std::endl;
+      return;
+   }
+
+   char line[ 1024 ];
+   std::set< int > processors;
+   while( ! file.eof() )
+   {
+      int i;
+      file.getline( line, 1024 );
+      if( strncmp( line, "physical id", strlen( "physical id" ) ) == 0 )
+      {
+         i = strlen( "physical id" );
+         while( line[ i ] != ':' && line[ i ] ) i ++;
+         processors.insert( atoi( &line[ i + 1 ] ) );
+         continue;
+      }
+      // FIXME: the rest does not work on heterogeneous multi-socket systems
+      if( strncmp( line, "model name", strlen( "model name" ) ) == 0 )
+      {
+         i = strlen( "model name" );
+         while( line[ i ] != ':' && line[ i ] ) i ++;
+         CPUModelName.setString( &line[ i + 1 ] );
+         continue;
+      }
+      if( strncmp( line, "cpu cores", strlen( "cpu cores" ) ) == 0 )
+      {
+         i = strlen( "cpu MHz" );
+         while( line[ i ] != ':' && line[ i ] ) i ++;
+         CPUCores = atoi( &line[ i + 1 ] );
+         continue;
+      }
+      if( strncmp( line, "siblings", strlen( "siblings" ) ) == 0 )
+      {
+         i = strlen( "siblings" );
+         while( line[ i ] != ':' && line[ i ] ) i ++;
+         CPUThreads = atoi( &line[ i + 1 ] );
+      }
+   }
+   numberOfProcessors = processors.size();
+}
+
+
 size_t Host::getFreeMemory()
 {
    long pages = sysconf(_SC_PHYS_PAGES);
diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h
index bf41998ffc8ad761a7a707e12b5f64347841bc21..6c5c8a869321e24c4c4d19fc66ef4d7aea72b345 100644
--- a/src/TNL/Devices/Host.h
+++ b/src/TNL/Devices/Host.h
@@ -10,7 +10,8 @@
 
 #pragma once 
 
-#include <unistd.h>
+#include <fstream>
+
 #include <TNL/String.h>
 
 namespace TNL {
@@ -20,14 +21,39 @@ namespace Config {
    class ParameterContainer;
 }
 
+class Logger;
+
 namespace Devices {
 
+struct CacheSizes {
+   int L1instruction = 0;
+   int L1data = 0;
+   int L2 = 0;
+   int L3 = 0;
+};
+
 class Host
 {
    public:
 
       static String getDeviceType();
 
+      static String getHostname( void );
+      static String getArchitecture( void );
+      static String getSystemName( void );
+      static String getSystemRelease( void );
+      static String getCurrentTime( const char* format = "%a %b %d %Y, %H:%M:%S" );
+
+      static int    getNumberOfProcessors( void );
+      static String getOnlineCPUs( void );
+      static int    getNumberOfCores( int cpu_id );
+      static int    getNumberOfThreads( int cpu_id );
+      static String getCPUModelName( int cpu_id );
+      static int    getCPUMaxFrequency( int cpu_id );
+      static CacheSizes getCPUCacheSizes( int cpu_id );
+
+      static void writeDeviceInfo( Logger& logger );
+
       static size_t getFreeMemory();
  
       static void disableOMP();
@@ -58,12 +84,31 @@ class Host
                          const String& prefix = "" );
 
    protected:
+
+      static int numberOfProcessors;
+      static String CPUModelName;
+      static int CPUThreads;
+      static int CPUCores;
+   
+      static void parseCPUInfo( void );
+
+      template< typename ResultType >
+      static ResultType
+      readFile( const String & fileName )
+      {
+         std::ifstream file( fileName.getString() );
+         if( ! file ) {
+            std::cerr << "Unable to read information from " << fileName << "." << std::endl;
+            return 0;
+         }
+         ResultType result;
+         file >> result;
+         return result;
+      }
  
       static bool ompEnabled;
  
       static int maxThreadsCount;
-
-
 };
 
 } // namespace Devices
diff --git a/src/TNL/Experimental/Multimaps/EllpackIndexMultimap_impl.h b/src/TNL/Experimental/Multimaps/EllpackIndexMultimap_impl.h
index 8a24c06cb247e86827290da94e26f60d8a27e6d0..1b7c41a937f3fd5b9cd92c600364f09d5b4f28e3 100644
--- a/src/TNL/Experimental/Multimaps/EllpackIndexMultimap_impl.h
+++ b/src/TNL/Experimental/Multimaps/EllpackIndexMultimap_impl.h
@@ -76,12 +76,12 @@ void
 EllpackIndexMultimap< Index, Device >::
 allocate( const ValuesAllocationVectorType& portsCount )
 {
-   Assert( portsCount.getSize() == this->keysRange,
+   TNL_ASSERT( portsCount.getSize() == this->keysRange,
               std::cerr << "portsCount.getSize() =  " << portsCount.getSize()
                    << "this->inputs = " << this->keysRange );
    this->valuesMaxCount = portsCount.max();
  
-   Assert( this->valuesMaxCount >= 0 && this->valuesMaxCount <= this->valuesRange,
+   TNL_ASSERT( this->valuesMaxCount >= 0 && this->valuesMaxCount <= this->valuesRange,
               std::cerr << "this->portsMaxCount = " << this->valuesMaxCount
                    << " this->outputs = " << this->valuesRange );
    this->values.setSize( this->keysRange * this->valuesMaxCount );
diff --git a/src/TNL/File_impl.h b/src/TNL/File_impl.h
index 3342c4b5e168096742b51c789aad5bf91a08e036..a6b746f4880f7ebea1e52aee689c7d4b6fd21fcb 100644
--- a/src/TNL/File_impl.h
+++ b/src/TNL/File_impl.h
@@ -29,7 +29,7 @@ template< typename Type, typename Device, typename Index >
 bool File :: read( Type* buffer,
                    const Index& _elements )
 {
-   Assert( _elements >= 0,
+   TNL_ASSERT( _elements >= 0,
            std::cerr << " elements = " << _elements << std::endl; );
 
    // convert _elements from Index to size_t, which is *unsigned* type
@@ -126,7 +126,7 @@ template< class Type, typename Device, typename Index >
 bool File :: write( const Type* buffer,
                     const Index _elements )
 {
-   Assert( _elements >= 0,
+   TNL_ASSERT( _elements >= 0,
            std::cerr << " elements = " << _elements << std::endl; );
 
    // convert _elements from Index to size_t, which is *unsigned* type
diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h
index f79ae2ed88f8542f76aef01a4dd42692ff790863..a3cf8a46c307312ff0e10127da53339c48ddb3e4 100644
--- a/src/TNL/Functions/MeshFunction.h
+++ b/src/TNL/Functions/MeshFunction.h
@@ -165,5 +165,3 @@ class MeshFunction :
 } // namespace TNL
 
 #include <TNL/Functions/MeshFunction_impl.h>
-#include <TNL/Functions/MeshFunctionGnuplotWriter_impl.h>
-#include <TNL/Functions/MeshFunctionVTKWriter_impl.h>
diff --git a/src/TNL/Functions/MeshFunctionGnuplotWriter.h b/src/TNL/Functions/MeshFunctionGnuplotWriter.h
index 243577efcb0f45c22a6a15b941a8d4d0e9f4045a..887498711e8f3e4422a7cc99994cfecc2301504d 100644
--- a/src/TNL/Functions/MeshFunctionGnuplotWriter.h
+++ b/src/TNL/Functions/MeshFunctionGnuplotWriter.h
@@ -13,7 +13,7 @@
 #include <TNL/Meshes/Grid.h>
 
 namespace TNL {
-namespace Functions {   
+namespace Functions {
 
 template< typename, int, typename > class MeshFunction;
 
@@ -62,6 +62,7 @@ class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 1, MeshReal, Device
                          std::ostream& str );
 };
 
+
 /***
  * 2D grids cells
  */
@@ -98,7 +99,6 @@ class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 2, MeshReal, Device
                          std::ostream& str );
 };
 
-
 /***
  * 2D grids vertices
  */
@@ -117,6 +117,62 @@ class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 2, MeshReal, Device
                          std::ostream& str );
 };
 
+
+/***
+ * 3D grids cells
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3, Real > >
+{
+   public:
+      typedef Meshes::Grid< 3, MeshReal, Device, MeshIndex > MeshType;
+      typedef Real RealType;
+      typedef Functions::MeshFunction< MeshType, 3, RealType > MeshFunctionType;
+
+      static bool write( const MeshFunctionType& function,
+                         std::ostream& str );
+};
+
+/***
+ * 3D grids faces
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2, Real > >
+{
+   public:
+      typedef Meshes::Grid< 3, MeshReal, Device, MeshIndex > MeshType;
+      typedef Real RealType;
+      typedef Functions::MeshFunction< MeshType, 2, RealType > MeshFunctionType;
+
+      static bool write( const MeshFunctionType& function,
+                         std::ostream& str );
+};
+
+/***
+ * 3D grids vertices
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+class MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0, Real > >
+{
+   public:
+      typedef Meshes::Grid< 3, MeshReal, Device, MeshIndex > MeshType;
+      typedef Real RealType;
+      typedef Functions::MeshFunction< MeshType, 0, RealType > MeshFunctionType;
+
+      static bool write( const MeshFunctionType& function,
+                         std::ostream& str );
+};
+
 } // namespace Functions
 } // namespace TNL
 
+#include <TNL/Functions/MeshFunctionGnuplotWriter_impl.h>
diff --git a/src/TNL/Functions/MeshFunctionGnuplotWriter_impl.h b/src/TNL/Functions/MeshFunctionGnuplotWriter_impl.h
index a870b86425c3b26f513d1fd1d6e94a89c36bba68..341ed711a4cb4da60f3f6501a5dec0f8b736925a 100644
--- a/src/TNL/Functions/MeshFunctionGnuplotWriter_impl.h
+++ b/src/TNL/Functions/MeshFunctionGnuplotWriter_impl.h
@@ -10,8 +10,10 @@
 
 #pragma once
 
+#include <TNL/Functions/MeshFunctionGnuplotWriter.h>
+
 namespace TNL {
-namespace Functions {    
+namespace Functions {
 
 template< typename MeshFunction >
 bool
@@ -43,7 +45,7 @@ write( const MeshFunctionType& function,
    {
       entity.refresh();
       typename MeshType::VertexType v = entity.getCenter();
-      str << v << " "
+      str << v.x() << " "
           << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
    return true;
@@ -69,7 +71,7 @@ write( const MeshFunctionType& function,
    {
       entity.refresh();
       typename MeshType::VertexType v = entity.getCenter();
-      str << v << " "
+      str << v.x() << " "
           << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
    return true;
@@ -124,7 +126,7 @@ write( const MeshFunctionType& function,
    typedef typename MeshType::Face EntityType;
    typedef typename EntityType::EntityOrientationType EntityOrientation;
    EntityType entity( mesh );
- 
+
    entity.setOrientation( EntityOrientation( 1.0, 0.0 ) );
    for( entity.getCoordinates().y() = 0;
         entity.getCoordinates().y() < mesh.getDimensions().y();
@@ -141,7 +143,7 @@ write( const MeshFunctionType& function,
       }
       str << std::endl;
    }
- 
+
    entity.setOrientation( EntityOrientation( 0.0, 1.0 ) );
          for( entity.getCoordinates().x() = 0;
            entity.getCoordinates().x() < mesh.getDimensions().x();
@@ -196,6 +198,157 @@ write( const MeshFunctionType& function,
    return true;
 }
 
+
+/****
+ * 3D grid, cells
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+bool
+MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3, Real > >::
+write( const MeshFunctionType& function,
+       std::ostream& str )
+{
+   const MeshType& mesh = function.getMesh();
+   typename MeshType::Cell entity( mesh );
+   for( entity.getCoordinates().z() = 0;
+        entity.getCoordinates().z() < mesh.getDimensions().z();
+        entity.getCoordinates().z() ++ )
+      for( entity.getCoordinates().y() = 0;
+           entity.getCoordinates().y() < mesh.getDimensions().y();
+           entity.getCoordinates().y() ++ )
+      {
+         for( entity.getCoordinates().x() = 0;
+              entity.getCoordinates().x() < mesh.getDimensions().x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            typename MeshType::VertexType v = entity.getCenter();
+            str << v.x() << " " << v.y() << " " << v.z() << " "
+                << function.getData().getElement( entity.getIndex() ) << std::endl;
+         }
+         str << std::endl;
+      }
+   return true;
+}
+
+/****
+ * 3D grid, faces
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+bool
+MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2, Real > >::
+write( const MeshFunctionType& function,
+       std::ostream& str )
+{
+   const MeshType& mesh = function.getMesh();
+   typedef typename MeshType::Face EntityType;
+   typedef typename EntityType::EntityOrientationType EntityOrientation;
+   EntityType entity( mesh );
+
+   entity.setOrientation( EntityOrientation( 1.0, 0.0, 0.0 ) );
+   for( entity.getCoordinates().z() = 0;
+        entity.getCoordinates().z() < mesh.getDimensions().z();
+        entity.getCoordinates().z() ++ )
+      for( entity.getCoordinates().y() = 0;
+           entity.getCoordinates().y() < mesh.getDimensions().y();
+           entity.getCoordinates().y() ++ )
+      {
+         for( entity.getCoordinates().x() = 0;
+              entity.getCoordinates().x() <= mesh.getDimensions().x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            typename MeshType::VertexType v = entity.getCenter();
+            str << v.x() << " " << v.y() << " " << v.z() << " "
+                << function.getData().getElement( entity.getIndex() ) << std::endl;
+         }
+         str << std::endl;
+      }
+
+   entity.setOrientation( EntityOrientation( 0.0, 1.0, 0.0 ) );
+   for( entity.getCoordinates().z() = 0;
+        entity.getCoordinates().z() < mesh.getDimensions().z();
+        entity.getCoordinates().z() ++ )
+      for( entity.getCoordinates().x() = 0;
+           entity.getCoordinates().x() < mesh.getDimensions().x();
+           entity.getCoordinates().x() ++ )
+      {
+         for( entity.getCoordinates().y() = 0;
+              entity.getCoordinates().y() <= mesh.getDimensions().y();
+              entity.getCoordinates().y() ++ )
+         {
+            entity.refresh();
+            typename MeshType::VertexType v = entity.getCenter();
+            str << v.x() << " " << v.y() << " " << v.z() << " "
+                << function.getData().getElement( entity.getIndex() ) << std::endl;
+         }
+         str << std::endl;
+      }
+
+   entity.setOrientation( EntityOrientation( 0.0, 0.0, 1.0 ) );
+   for( entity.getCoordinates().x() = 0;
+        entity.getCoordinates().x() < mesh.getDimensions().x();
+        entity.getCoordinates().x() ++ )
+      for( entity.getCoordinates().y() = 0;
+           entity.getCoordinates().y() <= mesh.getDimensions().y();
+           entity.getCoordinates().y() ++ )
+      {
+         for( entity.getCoordinates().z() = 0;
+              entity.getCoordinates().z() < mesh.getDimensions().z();
+              entity.getCoordinates().z() ++ )
+         {
+            entity.refresh();
+            typename MeshType::VertexType v = entity.getCenter();
+            str << v.x() << " " << v.y() << " " << v.z() << " "
+                << function.getData().getElement( entity.getIndex() ) << std::endl;
+         }
+         str << std::endl;
+      }
+   return true;
+}
+
+
+/****
+ * 3D grid, vertices
+ */
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real >
+bool
+MeshFunctionGnuplotWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0, Real > >::
+write( const MeshFunctionType& function,
+       std::ostream& str )
+{
+   const MeshType& mesh = function.getMesh();
+   typename MeshType::Vertex entity( mesh );
+   for( entity.getCoordinates().z() = 0;
+        entity.getCoordinates().z() <= mesh.getDimensions().z();
+        entity.getCoordinates().z() ++ )
+      for( entity.getCoordinates().y() = 0;
+           entity.getCoordinates().y() <= mesh.getDimensions().y();
+           entity.getCoordinates().y() ++ )
+      {
+         for( entity.getCoordinates().x() = 0;
+              entity.getCoordinates().x() <= mesh.getDimensions().x();
+              entity.getCoordinates().x() ++ )
+         {
+            entity.refresh();
+            typename MeshType::VertexType v = entity.getCenter();
+            str << v.x() << " " << v.y() << " " << v.z() << " "
+                << function.getData().getElement( entity.getIndex() ) << std::endl;
+         }
+         str << std::endl;
+      }
+   return true;
+}
+
 } // namespace Functions
 } // namespace TNL
 
diff --git a/src/TNL/Functions/MeshFunctionNormGetter.h b/src/TNL/Functions/MeshFunctionNormGetter.h
index dd696b579c9677fb60127023e354f6b3e91d3acf..8f09c90e3f5b36b63de4e26f71f5993c1819432a 100644
--- a/src/TNL/Functions/MeshFunctionNormGetter.h
+++ b/src/TNL/Functions/MeshFunctionNormGetter.h
@@ -134,7 +134,7 @@ class MeshFunctionNormGetter< MeshFunction< Meshes::Grid< Dimensions, MeshReal,
          }
          if( EntityDimensions > 0 )
          {
-            Assert( false, std::cerr << "Not implemented yet." << std::endl );
+            TNL_ASSERT( false, std::cerr << "Not implemented yet." << std::endl );
          }
  
          if( p == 1.0 )
diff --git a/src/TNL/Functions/MeshFunctionVTKWriter.h b/src/TNL/Functions/MeshFunctionVTKWriter.h
index 239b62f3795b2b373f4725aab699516080778763..39651c27450c407639c1278f62220b82f51f240b 100644
--- a/src/TNL/Functions/MeshFunctionVTKWriter.h
+++ b/src/TNL/Functions/MeshFunctionVTKWriter.h
@@ -10,9 +10,13 @@
 
 #pragma once
 
+#include <TNL/Meshes/Grid.h>
+
 namespace TNL {
 namespace Functions {   
 
+template< typename, int, typename > class MeshFunction;
+
 template< typename MeshFunction >
 class MeshFunctionVTKWriter
 {
@@ -207,3 +211,4 @@ class MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, Me
 } // namespace Functions
 } // namespace TNL
 
+#include <TNL/Functions/MeshFunctionVTKWriter_impl.h>
diff --git a/src/TNL/Functions/MeshFunctionVTKWriter_impl.h b/src/TNL/Functions/MeshFunctionVTKWriter_impl.h
index 0493b0a421c973c110a661364ce4aa9814fc30c8..b432671764c0545c64e5c8ea241c820525f20f05 100644
--- a/src/TNL/Functions/MeshFunctionVTKWriter_impl.h
+++ b/src/TNL/Functions/MeshFunctionVTKWriter_impl.h
@@ -10,6 +10,8 @@
 
 #pragma once
 
+#include <TNL/Functions/MeshFunctionVTKWriter.h>
+
 namespace TNL {
 namespace Functions {   
 
@@ -34,7 +36,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 1, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -61,7 +63,6 @@ write( const MeshFunctionType& function,
    const RealType spaceStep = mesh.getSpaceSteps().x();
  
    str << "POINTS " << mesh.getDimensions().x() + 1 << " float" << std::endl;
- 
    for (int i = 0; i <= mesh.getDimensions().x(); i++)
    {
        str << origin + i * spaceStep << " 0 0" << std::endl;
@@ -83,11 +84,9 @@ write( const MeshFunctionType& function,
    str << "SCALARS cellFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Cell entity( mesh );
-   for( entity.getCoordinates().x() = 0;
-        entity.getCoordinates().x() < mesh.getDimensions().x();
-        entity.getCoordinates().x() ++ )
+   for( MeshIndex i = 0; i < mesh.template getEntitiesCount< typename MeshType::Cell >(); i++ )
    {
+      typename MeshType::Cell entity = mesh.template getEntity< typename MeshType::Cell >( i );
       entity.refresh();
       str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
@@ -106,7 +105,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 0, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -133,7 +132,6 @@ write( const MeshFunctionType& function,
    const RealType spaceStep = mesh.getSpaceSteps().x();
  
    str << "POINTS " << mesh.getDimensions().x() + 1 << " float" << std::endl;
- 
    for (int i = 0; i < mesh.getDimensions().x() + 1; i++)
    {
        str << origin + i * spaceStep << " 0 0" << std::endl;
@@ -155,11 +153,9 @@ write( const MeshFunctionType& function,
    str << "SCALARS VerticesFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Vertex entity( mesh );
-   for( entity.getCoordinates().x() = 0;
-        entity.getCoordinates().x() <= mesh.getDimensions().x();
-        entity.getCoordinates().x() ++ )
+   for( MeshIndex i = 0; i < mesh.template getEntitiesCount< typename MeshType::Vertex >(); i++ )
    {
+      typename MeshType::Vertex entity = mesh.template getEntity< typename MeshType::Vertex >( i );
       entity.refresh();
       str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
@@ -178,7 +174,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 2, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -205,9 +201,10 @@ write( const MeshFunctionType& function,
    const RealType spaceStepX = mesh.getSpaceSteps().x();
    const RealType originY = mesh.getOrigin().y();
    const RealType spaceStepY = mesh.getSpaceSteps().y();
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
+   const MeshIndex entitiesCount = mesh.template getEntitiesCount< typename MeshType::Cell >();
  
-   str << "POINTS " << (mesh.getDimensions().x() + 1) * (mesh.getDimensions().y() + 1) << " float" << std::endl;
- 
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int j = 0; j < mesh.getDimensions().y() + 1; j++)
    {
         for (int i = 0; i < mesh.getDimensions().x() + 1; i++)
@@ -216,8 +213,7 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELLS " << mesh.getDimensions().x() * mesh.getDimensions().y() << " " <<
-          mesh.getDimensions().x() * mesh.getDimensions().y() * 5 << std::endl;
+   str << std::endl << "CELLS " << entitiesCount << " " << entitiesCount * 5 << std::endl;
    for (int j = 0; j < mesh.getDimensions().y(); j++)
    {
         for (int i = 0; i < mesh.getDimensions().x(); i++)
@@ -233,23 +229,17 @@ write( const MeshFunctionType& function,
        str << "8 " << std::endl;
    }
  
-   str << std::endl << "CELL_DATA " << mesh.getDimensions().x() * mesh.getDimensions().y() << std::endl;
+   str << std::endl << "CELL_DATA " << entitiesCount << std::endl;
    str << "SCALARS cellFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Cell entity( mesh );
-   for( entity.getCoordinates().y() = 0;
-        entity.getCoordinates().y() < mesh.getDimensions().y();
-        entity.getCoordinates().y() ++ )
+   for( MeshIndex i = 0; i < entitiesCount; i++ )
    {
-      for( entity.getCoordinates().x() = 0;
-           entity.getCoordinates().x() < mesh.getDimensions().x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         str << function.getData().getElement( entity.getIndex() ) << std::endl;
-      }
+      typename MeshType::Cell entity = mesh.template getEntity< typename MeshType::Cell >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
+
    return true;
 }
 
@@ -264,7 +254,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 1, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -293,9 +283,10 @@ write( const MeshFunctionType& function,
    const RealType spaceStepX = mesh.getSpaceSteps().x();
    const RealType originY = mesh.getOrigin().y();
    const RealType spaceStepY = mesh.getSpaceSteps().y();
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
+   const MeshIndex entitiesCount = mesh.template getEntitiesCount< typename MeshType::Face >();
  
-   str << "POINTS " << mesh.template getEntitiesCount< Vertex >() << " float" << std::endl;
- 
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int j = 0; j < ( mesh.getDimensions().y() + 1); j++)
    {
         for (int i = 0; i < ( mesh.getDimensions().x() + 1 ); i++)
@@ -304,8 +295,7 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELLS " << mesh.template getEntitiesCount< Face >() << " " <<
-          mesh.template getEntitiesCount< Face >() * 3 << std::endl;
+   str << std::endl << "CELLS " << entitiesCount << " " << entitiesCount * 3 << std::endl;
    for (int j = 0; j < mesh.getDimensions().y(); j++)
    {
         for (int i = 0; i < ( mesh.getDimensions().x() + 1 ); i++)
@@ -322,49 +312,23 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELL_TYPES " << mesh.template getEntitiesCount< Face >() << std::endl;
-   for (int i = 0; i < mesh.template getEntitiesCount< Face >(); i++)
+   str << std::endl << "CELL_TYPES " << entitiesCount << std::endl;
+   for (int i = 0; i < entitiesCount; i++)
    {
        str << "3" << std::endl;
    }
  
-   str << std::endl << "CELL_DATA " << mesh.template getEntitiesCount< Face >() << std::endl;
+   str << std::endl << "CELL_DATA " << entitiesCount << std::endl;
    str << "SCALARS FaceslFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typedef typename MeshType::Face EntityType;
-   typedef typename EntityType::EntityOrientationType EntityOrientation;
-   EntityType entity( mesh );
- 
-   entity.setOrientation( EntityOrientation( 1.0, 0.0 ) );
-   for( entity.getCoordinates().y() = 0;
-        entity.getCoordinates().y() < mesh.getDimensions().y();
-        entity.getCoordinates().y() ++ )
+   for( MeshIndex i = 0; i < entitiesCount; i++ )
    {
-      for( entity.getCoordinates().x() = 0;
-           entity.getCoordinates().x() <= mesh.getDimensions().x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         str << function.getData().getElement( entity.getIndex() ) << std::endl;
-      }
+      typename MeshType::Face entity = mesh.template getEntity< typename MeshType::Face >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
- 
-   entity.setOrientation( EntityOrientation( 0.0, 1.0 ) );
-   for( entity.getCoordinates().y() = 0;
-        entity.getCoordinates().y() <= mesh.getDimensions().y();
-        entity.getCoordinates().y() ++ )
 
-   {
-        for( entity.getCoordinates().x() = 0;
-             entity.getCoordinates().x() < mesh.getDimensions().x();
-             entity.getCoordinates().x() ++ )
-
-      {
-         entity.refresh();
-         str << function.getData().getElement( entity.getIndex() ) << std::endl;
-      }
-   }
    return true;
 }
 
@@ -379,7 +343,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 0, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -407,10 +371,9 @@ write( const MeshFunctionType& function,
    const RealType spaceStepX = mesh.getSpaceSteps().x();
    const RealType originY = mesh.getOrigin().y();
    const RealType spaceStepY = mesh.getSpaceSteps().y();
-
- 
-   str << "POINTS " << mesh.template getEntitiesCount< Vertex >() << " float" << std::endl;
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
  
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int j = 0; j < ( mesh.getDimensions().y() + 1); j++)
    {
         for (int i = 0; i < ( mesh.getDimensions().x() + 1 ); i++)
@@ -419,8 +382,7 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELLS " << mesh.template getEntitiesCount< Vertex >() << " " <<
-          mesh.template getEntitiesCount< Vertex >() * 2 << std::endl;
+   str << std::endl << "CELLS " << verticesCount << " " << verticesCount * 2 << std::endl;
    for (int j = 0; j < ( mesh.getDimensions().y() + 1 ); j++)
    {
         for (int i = 0; i < ( mesh.getDimensions().x() + 1 ); i++)
@@ -429,30 +391,23 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELL_TYPES " << mesh.template getEntitiesCount< Vertex >() << std::endl;
-   for (int i = 0; i < mesh.template getEntitiesCount< Vertex >(); i++)
+   str << std::endl << "CELL_TYPES " << verticesCount << std::endl;
+   for (int i = 0; i < verticesCount; i++)
    {
        str << "1" << std::endl;
    }
  
-   str << std::endl << "CELL_DATA " << mesh.template getEntitiesCount< Vertex >() << std::endl;
+   str << std::endl << "CELL_DATA " << verticesCount << std::endl;
    str << "SCALARS VerticesFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Vertex entity( mesh );
-   for( entity.getCoordinates().y() = 0;
-        entity.getCoordinates().y() <= mesh.getDimensions().y();
-        entity.getCoordinates().y() ++ )
+   for( MeshIndex i = 0; i < verticesCount; i++ )
    {
-      for( entity.getCoordinates().x() = 0;
-           entity.getCoordinates().x() <= mesh.getDimensions().x();
-           entity.getCoordinates().x() ++ )
-      {
-         entity.refresh();
-         str << function.getData().getElement( entity.getIndex() ) << std::endl;
-      }
+      typename MeshType::Vertex entity = mesh.template getEntity< typename MeshType::Vertex >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
- 
+
    return true;
 }
 
@@ -467,7 +422,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -496,11 +451,10 @@ write( const MeshFunctionType& function,
    const RealType spaceStepY = mesh.getSpaceSteps().y();
    const RealType originZ = mesh.getOrigin().z();
    const RealType spaceStepZ = mesh.getSpaceSteps().z();
-   const RealType entitiesCount = mesh.getDimensions().x() * mesh.getDimensions().y() * mesh.getDimensions().z();
- 
-   str << "POINTS " << (mesh.getDimensions().x()+1) * (mesh.getDimensions().y()+1) * (mesh.getDimensions().z()+1) <<
-          " float" << std::endl;
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
+   const MeshIndex entitiesCount = mesh.template getEntitiesCount< typename MeshType::Cell >();
  
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int k = 0; k <= mesh.getDimensions().y(); k++)
    {
        for (int j = 0; j <= mesh.getDimensions().y(); j++)
@@ -543,24 +497,13 @@ write( const MeshFunctionType& function,
    str << "SCALARS cellFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Cell entity( mesh );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() < mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
+   for( MeshIndex i = 0; i < entitiesCount; i++ )
    {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() < mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() < mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                entity.refresh();
-                str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
+      typename MeshType::Cell entity = mesh.template getEntity< typename MeshType::Cell >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
+
    return true;
 }
 
@@ -575,7 +518,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -595,8 +538,6 @@ MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshInde
 write( const MeshFunctionType& function,
        std::ostream& str )
 {
-   typedef typename MeshType::template MeshEntity< 2 > Face;
-   typedef typename MeshType::template MeshEntity< 3 > Cell;
    writeHeader(function, str);
  
    const MeshType& mesh = function.getMesh();
@@ -606,12 +547,10 @@ write( const MeshFunctionType& function,
    const RealType spaceStepY = mesh.getSpaceSteps().y();
    const RealType originZ = mesh.getOrigin().z();
    const RealType spaceStepZ = mesh.getSpaceSteps().z();
-   const RealType entitiesCount = mesh.template getEntitiesCount< Face >();
-   const RealType pointsCount = mesh.template getEntitiesCount< Cell >();
- 
-   str << "POINTS " << pointsCount <<
-          " float" << std::endl;
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
+   const MeshIndex entitiesCount = mesh.template getEntitiesCount< typename MeshType::Face >();
  
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int k = 0; k <= mesh.getDimensions().y(); k++)
    {
        for (int j = 0; j <= mesh.getDimensions().y(); j++)
@@ -624,8 +563,7 @@ write( const MeshFunctionType& function,
        }
    }
  
-   str << std::endl << "CELLS " << entitiesCount << " " <<
-          entitiesCount * 5 << std::endl;
+   str << std::endl << "CELLS " << entitiesCount << " " << entitiesCount * 5 << std::endl;
    for (int k = 0; k < mesh.getDimensions().z(); k++)
    {
         for (int j = 0; j < mesh.getDimensions().y(); j++)
@@ -678,67 +616,13 @@ write( const MeshFunctionType& function,
    str << "SCALARS facesFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typedef typename MeshType::Face EntityType;
-   typedef typename EntityType::EntityOrientationType EntityOrientation;
-   EntityType entity( mesh );
- 
-   entity.setOrientation( EntityOrientation( 1.0, 0.0 , 0.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() < mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
-   {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() < mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() <= mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
-   }
- 
-   entity.setOrientation( EntityOrientation( 0.0, 1.0 , 0.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() < mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
-   {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() <= mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() < mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
-   }
- 
-   entity.setOrientation( EntityOrientation( 0.0, 0.0 , 1.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() <= mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
+   for( MeshIndex i = 0; i < entitiesCount; i++ )
    {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() < mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() < mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
+      typename MeshType::Face entity = mesh.template getEntity< typename MeshType::Face >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
- 
+
    return true;
 }
 
@@ -753,7 +637,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 1, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -773,8 +657,6 @@ MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshInde
 write( const MeshFunctionType& function,
        std::ostream& str )
 {
-   typedef typename MeshType::template MeshEntity< 1 > Edge;
-   typedef typename MeshType::template MeshEntity< 3 > Cell;
    writeHeader(function, str);
  
    const MeshType& mesh = function.getMesh();
@@ -784,12 +666,10 @@ write( const MeshFunctionType& function,
    const RealType spaceStepY = mesh.getSpaceSteps().y();
    const RealType originZ = mesh.getOrigin().z();
    const RealType spaceStepZ = mesh.getSpaceSteps().z();
-   const RealType entitiesCount = mesh.template getEntitiesCount< Edge >();
-   const RealType pointsCount = mesh.template getEntitiesCount< Cell >();
- 
-   str << "POINTS " << pointsCount <<
-          " float" << std::endl;
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
+   const MeshIndex entitiesCount = mesh.template getEntitiesCount< typename MeshType::Edge >();
  
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int k = 0; k <= mesh.getDimensions().y(); k++)
    {
        for (int j = 0; j <= mesh.getDimensions().y(); j++)
@@ -802,8 +682,7 @@ write( const MeshFunctionType& function,
        }
    }
  
-   str << std::endl << "CELLS " << entitiesCount << " " <<
-          entitiesCount * 3 << std::endl;
+   str << std::endl << "CELLS " << entitiesCount << " " << entitiesCount * 3 << std::endl;
    for (int k = 0; k <= mesh.getDimensions().z(); k++)
    {
         for (int j = 0; j <= mesh.getDimensions().y(); j++)
@@ -850,67 +729,13 @@ write( const MeshFunctionType& function,
    str << "SCALARS edgesFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typedef typename MeshType::Face EntityType;
-   typedef typename EntityType::EntityOrientationType EntityOrientation;
-   EntityType entity( mesh );
- 
-   entity.setOrientation( EntityOrientation( 1.0, 0.0 , 0.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() <= mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
+   for( MeshIndex i = 0; i < entitiesCount; i++ )
    {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() <= mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() < mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
-   }
- 
-   entity.setOrientation( EntityOrientation( 0.0, 1.0 , 0.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() <= mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
-   {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() < mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() <= mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
-   }
- 
-   entity.setOrientation( EntityOrientation( 0.0, 0.0 , 1.0) );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() < mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
-   {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() <= mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() <= mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
+      typename MeshType::Edge entity = mesh.template getEntity< typename MeshType::Edge >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
- 
+
    return true;
 }
 
@@ -925,7 +750,7 @@ template< typename MeshReal,
 void
 MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0, Real > >::
 writeHeader( const MeshFunctionType& function,
-       std::ostream& str )
+             std::ostream& str )
 {
     const MeshType& mesh = function.getMesh();
     const typename MeshType::VertexType& origin = mesh.getOrigin();
@@ -945,7 +770,6 @@ MeshFunctionVTKWriter< MeshFunction< Meshes::Grid< 3, MeshReal, Device, MeshInde
 write( const MeshFunctionType& function,
        std::ostream& str )
 {
-   typedef typename MeshType::template MeshEntity< 0 > Vertex;
    writeHeader(function, str);
  
    const MeshType& mesh = function.getMesh();
@@ -955,9 +779,9 @@ write( const MeshFunctionType& function,
    const RealType spaceStepY = mesh.getSpaceSteps().y();
    const RealType originZ = mesh.getOrigin().z();
    const RealType spaceStepZ = mesh.getSpaceSteps().z();
+   const MeshIndex verticesCount = mesh.template getEntitiesCount< typename MeshType::Vertex >();
  
-   str << "POINTS " << mesh.template getEntitiesCount< Vertex >() << " float" << std::endl;
- 
+   str << "POINTS " << verticesCount << " float" << std::endl;
    for (int k = 0; k <= mesh.getDimensions().y(); k++)
    {
        for (int j = 0; j <= mesh.getDimensions().y(); j++)
@@ -970,8 +794,7 @@ write( const MeshFunctionType& function,
        }
    }
  
-   str << std::endl << "CELLS " << mesh.template getEntitiesCount< Vertex >() << " " <<
-          mesh.template getEntitiesCount< Vertex >() * 2 << std::endl;
+   str << std::endl << "CELLS " << verticesCount << " " << verticesCount * 2 << std::endl;
    for (int k = 0; k < ( mesh.getDimensions().z() + 1 ); k++)
    {
         for (int j = 0; j < ( mesh.getDimensions().y() + 1 ); j++)
@@ -983,35 +806,23 @@ write( const MeshFunctionType& function,
         }
    }
  
-   str << std::endl << "CELL_TYPES " << mesh.template getEntitiesCount< Vertex >() << std::endl;
-   for (int i = 0; i < mesh.template getEntitiesCount< Vertex >(); i++)
+   str << std::endl << "CELL_TYPES " << verticesCount << std::endl;
+   for (int i = 0; i < verticesCount; i++)
    {
        str << "1" << std::endl;
    }
  
-   str << std::endl << "CELL_DATA " << mesh.template getEntitiesCount< Vertex >() << std::endl;
+   str << std::endl << "CELL_DATA " << verticesCount << std::endl;
    str << "SCALARS verticesFunctionValues float 1" << std::endl;
    str << "LOOKUP_TABLE default" << std::endl;
 
-   typename MeshType::Vertex entity( mesh );
-   for( entity.getCoordinates().z() = 0;
-        entity.getCoordinates().z() <= mesh.getDimensions().z();
-        entity.getCoordinates().z() ++ )
+   for( MeshIndex i = 0; i < verticesCount; i++ )
    {
-        for( entity.getCoordinates().y() = 0;
-             entity.getCoordinates().y() <= mesh.getDimensions().y();
-             entity.getCoordinates().y() ++ )
-        {
-            for( entity.getCoordinates().x() = 0;
-                 entity.getCoordinates().x() <= mesh.getDimensions().x();
-                 entity.getCoordinates().x() ++ )
-            {
-                 entity.refresh();
-                 str << function.getData().getElement( entity.getIndex() ) << std::endl;
-            }
-        }
+      typename MeshType::Vertex entity = mesh.template getEntity< typename MeshType::Vertex >( i );
+      entity.refresh();
+      str << function.getData().getElement( entity.getIndex() ) << std::endl;
    }
- 
+
    return true;
 }
 
diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h
index f41cc28a0bc1aff4a3b60e9b8ef3668452197848..53ab95680738b8ee77c7a76ca7de9e7344f3f3a8 100644
--- a/src/TNL/Functions/MeshFunction_impl.h
+++ b/src/TNL/Functions/MeshFunction_impl.h
@@ -37,7 +37,7 @@ MeshFunction( const MeshPointer& meshPointer )
 : meshPointer( meshPointer )
 {
    this->data.setSize( meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );
 }
@@ -47,7 +47,7 @@ template< typename Mesh,
           typename Real >
 MeshFunction< Mesh, MeshEntityDimensions, Real >::
 MeshFunction( const ThisType& meshFunction )
-: meshPointer( meshPointer )
+: meshPointer( meshFunction.meshPointer )
 {
    this->data.bind( meshFunction.getData() );
 }
@@ -63,7 +63,7 @@ MeshFunction( const MeshPointer& meshPointer,
 : meshPointer( meshPointer )
 {
    this->data.bind( data, offset, meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );   
 }
@@ -80,7 +80,7 @@ MeshFunction( const MeshPointer& meshPointer,
 : meshPointer( meshPointer )
 {
    this->data.bind( *data, offset, meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );   
 }
@@ -191,7 +191,7 @@ bind( const MeshPointer& meshPointer,
 {
    this->meshPointer = meshPointer;
    this->data.bind( data, offset, meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );   
 }
@@ -208,7 +208,7 @@ bind( const MeshPointer& meshPointer,
 {
    this->meshPointer = meshPointer;
    this->data.bind( *data, offset, meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );   
 }
@@ -223,7 +223,7 @@ setMesh( const MeshPointer& meshPointer )
 {
    this->meshPointer = meshPointer;
    this->data.setSize( meshPointer->template getEntitiesCount< typename Mesh::template MeshEntity< MeshEntityDimensions > >() );
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );   
 }
@@ -436,7 +436,7 @@ bool
 MeshFunction< Mesh, MeshEntityDimensions, Real >::
 save( File& file ) const
 {
-   Assert( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
+   TNL_ASSERT( this->data.getSize() == this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >(), 
       std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl
                 << "this->mesh->template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() = " << this->meshPointer.getData().template getEntitiesCount< typename MeshType::template MeshEntity< MeshEntityDimensions > >() );
    if( ! Object::save( file ) )
@@ -493,8 +493,12 @@ write( const String& fileName,
    }
    if( format == "vtk" )
       return MeshFunctionVTKWriter< ThisType >::write( *this, file );
-   if( format == "gnuplot" )
+   else if( format == "gnuplot" )
       return MeshFunctionGnuplotWriter< ThisType >::write( *this, file );
+   else {
+      std::cerr << "Unknown output format: " << format << std::endl;
+      return false;
+   }
    return true;
 }
  
diff --git a/src/TNL/Functions/OperatorFunction.h b/src/TNL/Functions/OperatorFunction.h
index 842383b29a7916ac33fa5c8e4dee9ef04f2b6c7f..628191165d38d3ae12052091e15b01ba43e852b7 100644
--- a/src/TNL/Functions/OperatorFunction.h
+++ b/src/TNL/Functions/OperatorFunction.h
@@ -84,13 +84,13 @@ class OperatorFunction< Operator, MeshFunctionT, void, true >
  
       const MeshType& getMesh() const
       {
-         Assert( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
+         TNL_ASSERT( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
          return this->preimageFunction->getMesh();
       };
       
       const MeshPointer& getMeshPointer() const
       { 
-         tnlAssert( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
+         TNL_ASSERT( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
          return this->preimageFunction->getMeshPointer(); 
       };
 
@@ -111,7 +111,7 @@ class OperatorFunction< Operator, MeshFunctionT, void, true >
          const MeshEntity& meshEntity,
          const RealType& time = 0.0 ) const
       {
-         Assert( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
+         TNL_ASSERT( this->preimageFunction, std::cerr << "The preimage function was not set." << std::endl );
          return operator_( *preimageFunction, meshEntity, time );
       }
  
@@ -293,13 +293,13 @@ class OperatorFunction< Operator, PreimageFunction, BoundaryConditions, false >
  
       const PreimageFunctionType& getPreimageFunction() const
       {
-         Assert( this->preimageFunction, );
+         TNL_ASSERT( this->preimageFunction, );
          return *this->preimageFunction;
       };
  
       PreimageFunctionType& getPreimageFunction()
       {
-         Assert( this->preimageFunction, );
+         TNL_ASSERT( this->preimageFunction, );
          return *this->preimageFunction;
       };
  
diff --git a/src/TNL/Functions/TestFunction_impl.h b/src/TNL/Functions/TestFunction_impl.h
index 35713a8944099edbef223c4017ade5cba749203d..506f536518d1946e33da00f749097b168bd8d09d 100644
--- a/src/TNL/Functions/TestFunction_impl.h
+++ b/src/TNL/Functions/TestFunction_impl.h
@@ -247,7 +247,7 @@ operator = ( const TestFunction& function )
          this->copyFunction< Blob< FunctionDimensions, Real > >( function.function );
          break;
       default:
-         Assert( false, );
+         TNL_ASSERT( false, );
          break;
    }
 
@@ -460,7 +460,7 @@ copyFunction( const void* function )
    }
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
       abort();
    }
 }
diff --git a/src/TNL/Images/DicomSeries.h b/src/TNL/Images/DicomSeries.h
index 1350cf4a280b7ba0ad89149f818a5476f29a98e1..b3ee1fb11c88f221512e6c27071f60fbdafddbc8 100644
--- a/src/TNL/Images/DicomSeries.h
+++ b/src/TNL/Images/DicomSeries.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/String.h>
 #include <TNL/param-types.h>
 #include <TNL/Images//Image.h>
@@ -106,7 +106,7 @@ class DicomSeries : public Image< int >
  
       bool loadImage( const String& filePath, int number );
 
-      List< String > fileList;
+      Containers::List< String > fileList;
  
       Containers::Array<DicomHeader *,Devices::Host,int> dicomSeriesHeaders;
 
diff --git a/src/TNL/Images/DicomSeries_impl.h b/src/TNL/Images/DicomSeries_impl.h
index 5d0abfea4a386dd4fcb6b5fc1590ced960b56955..e9a3f064c78925c2dc333e43f56a50d8a86f0c4a 100644
--- a/src/TNL/Images/DicomSeries_impl.h
+++ b/src/TNL/Images/DicomSeries_impl.h
@@ -155,7 +155,7 @@ inline bool DicomSeries::retrieveFileList( const String& filePath)
       String fileNamePrefix(fileName.getString(), 0, fileName.getLength() - separatorPosition);
 
       struct dirent **dirp;
-      List<String > files;
+      Containers::List<String > files;
 
       //scan and sort directory
       int ndirs = scandir(directoryPath.getString(), &dirp, filter, alphasort);
diff --git a/src/TNL/Logger.cpp b/src/TNL/Logger.cpp
index fdded84ad5dc5798caaf0bed5ba41ec0e62804e2..6287cc3b3337c8ec78ca4039e404e8a316f192f2 100644
--- a/src/TNL/Logger.cpp
+++ b/src/TNL/Logger.cpp
@@ -10,8 +10,7 @@
 
 #include <iomanip>
 #include <TNL/Logger.h>
-#include <TNL/tnlConfig.h>
-#include <TNL/SystemInfo.h>
+#include <TNL/Devices/Host.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
 
 namespace TNL {
@@ -45,90 +44,41 @@ void Logger :: writeSeparator()
 
 bool Logger :: writeSystemInformation( const Config::ParameterContainer& parameters )
 {
-   SystemInfo systemInfo;
-
-
-   writeParameter< String >( "Host name:", systemInfo.getHostname() );
-   writeParameter< String >( "Architecture:", systemInfo.getArchitecture() );
-   // FIXME: generalize for multi-socket systems, here we consider only the first found CPU
-   const int cpu_id = 0;
-   const int threads = systemInfo.getNumberOfThreads( cpu_id );
-   const int cores = systemInfo.getNumberOfCores( cpu_id );
-   int threadsPerCore = threads / cores;
-   writeParameter< String >( "CPU info", String("") );
-   writeParameter< String >( "Model name:", systemInfo.getCPUModelName( cpu_id ), 1 );
-   writeParameter< int >( "Cores:", cores, 1 );
-   writeParameter< int >( "Threads per core:", threadsPerCore, 1 );
-   writeParameter< String >( "Max clock rate (in MHz):", systemInfo.getCPUMaxFrequency( cpu_id ) / 1000, 1 );
-   tnlCacheSizes cacheSizes = systemInfo.getCPUCacheSizes( cpu_id );
-   String cacheInfo = String( cacheSizes.L1data ) + ", "
-                       + String( cacheSizes.L1instruction ) + ", "
-                       + String( cacheSizes.L2 ) + ", "
-                       + String( cacheSizes.L3 );
-   writeParameter< String >( "Cache (L1d, L1i, L2, L3):", cacheInfo, 1 );
+   Devices::Host::writeDeviceInfo( *this );
    if( parameters.getParameter< String >( "device" ) == "cuda" )
-   {
-      writeParameter< String >( "CUDA GPU info", String("") );
-      // TODO: Printing all devices does not make sense, but in the future TNL
-      //       might use more than one device for computations. Printing only
-      //       the active device for now...
-//      int devices = Devices::CudaDeviceInfo::getNumberOfDevices();
-//      writeParameter< int >( "Number of devices", devices, 1 );
-//      for( int i = 0; i < devices; i++ )
-//      {
-//        writeParameter< int >( "Device no.", i, 1 );
-        int i = Devices::CudaDeviceInfo::getActiveDevice();
-        writeParameter< String >( "Name", Devices::CudaDeviceInfo::getDeviceName( i ), 2 );
-        String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( i ) ) + "." +
-                                String( Devices::CudaDeviceInfo::getArchitectureMinor( i ) );
-        writeParameter< String >( "Architecture", deviceArch, 2 );
-        writeParameter< int >( "CUDA cores", Devices::CudaDeviceInfo::getCudaCores( i ), 2 );
-        double clockRate = ( double ) Devices::CudaDeviceInfo::getClockRate( i ) / 1.0e3;
-        writeParameter< double >( "Clock rate (in MHz)", clockRate, 2 );
-        double globalMemory = ( double ) Devices::CudaDeviceInfo::getGlobalMemory( i ) / 1.0e9;
-        writeParameter< double >( "Global memory (in GB)", globalMemory, 2 );
-        double memoryClockRate = ( double ) Devices::CudaDeviceInfo::getMemoryClockRate( i ) / 1.0e3;
-        writeParameter< double >( "Memory clock rate (in Mhz)", memoryClockRate, 2 );
-        writeParameter< bool >( "ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( i ), 2 );
-//      }
-   }
-   writeParameter< String >( "System:", systemInfo.getSystemName() );
-   writeParameter< String >( "Release:", systemInfo.getSystemRelease() );
-   writeParameter< char* >( "TNL Compiler:", ( char* ) TNL_CPP_COMPILER_NAME );
+      Devices::CudaDeviceInfo::writeDeviceInfo( *this );
    return true;
 }
 
 void Logger :: writeCurrentTime( const char* label )
 {
-   SystemInfo systemInfo;
-   writeParameter< String >( label, systemInfo.getCurrentTime() );
+   writeParameter< String >( label, Devices::Host::getCurrentTime() );
 }
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 template void Logger::writeParameter< char* >( const String&,
-                                                  const String&,
-                                                  const Config::ParameterContainer&,
-                                                  int );
+                                               const String&,
+                                               const Config::ParameterContainer&,
+                                               int );
 template void Logger::writeParameter< double >( const String&,
-                                                   const String&,
-                                                   const Config::ParameterContainer&,
-                                                   int );
-template void Logger::writeParameter< int >( const String&,
                                                 const String&,
                                                 const Config::ParameterContainer&,
                                                 int );
+template void Logger::writeParameter< int >( const String&,
+                                             const String&,
+                                             const Config::ParameterContainer&,
+                                             int );
 
 // TODO: fix this
 //template void Logger :: WriteParameter< char* >( const char*,
-//                                                    const char*&,
-//                                                    int );
+//                                                 const char*&,
+//                                                 int );
 template void Logger::writeParameter< double >( const String&,
-                                                   const double&,
-                                                   int );
-template void Logger::writeParameter< int >( const String&,
-                                                const int&,
+                                                const double&,
                                                 int );
-
+template void Logger::writeParameter< int >( const String&,
+                                             const int&,
+                                             int );
 #endif
 
 } // namespace TNL
diff --git a/src/TNL/Logger.h b/src/TNL/Logger.h
index 90dd927da5547af88ef5b885a2192c449fd3cd74..791398649d8d61f89a984cbb8255bb0be22a69d9 100644
--- a/src/TNL/Logger.h
+++ b/src/TNL/Logger.h
@@ -26,7 +26,6 @@ class Logger
 
    void writeSeparator();
 
-   // TODO: move this to Devices::Host
    bool writeSystemInformation( const Config::ParameterContainer& parameters );
  
 
@@ -59,28 +58,28 @@ namespace TNL {
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 extern template void Logger::writeParameter< char* >( const String&,
-                                                         const String&,
-                                                         const Config::ParameterContainer&,
-                                                         int );
+                                                      const String&,
+                                                      const Config::ParameterContainer&,
+                                                      int );
 extern template void Logger::writeParameter< double >( const String&,
-                                                          const String&,
-                                                          const Config::ParameterContainer&,
-                                                          int );
-extern template void Logger::writeParameter< int >( const String&,
                                                        const String&,
                                                        const Config::ParameterContainer&,
                                                        int );
+extern template void Logger::writeParameter< int >( const String&,
+                                                    const String&,
+                                                    const Config::ParameterContainer&,
+                                                    int );
 
 // TODO: fix this
 //extern template void Logger :: WriteParameter< char* >( const char*,
-//                                                           const char*&,
-//                                                           int );
+//                                                        const char*&,
+//                                                        int );
 extern template void Logger::writeParameter< double >( const String&,
-                                                          const double&,
-                                                          int );
-extern template void Logger::writeParameter< int >( const String&,
-                                                       const int&,
+                                                       const double&,
                                                        int );
+extern template void Logger::writeParameter< int >( const String&,
+                                                    const int&,
+                                                    int );
 #endif
 
 } // namespace TNL
diff --git a/src/TNL/Matrices/CSR.h b/src/TNL/Matrices/CSR.h
index e42fca6b73990d05e1d5312d61b7022ecc6f2b49..7c249944bfdd7ef062078f274492cb033d183eb5 100644
--- a/src/TNL/Matrices/CSR.h
+++ b/src/TNL/Matrices/CSR.h
@@ -41,6 +41,7 @@ class CSR : public Sparse< Real, Device, Index >
    typedef CSR< Real, Devices::Cuda, Index > CudaType;
    typedef Sparse< Real, Device, Index > BaseType;
    typedef typename BaseType::MatrixRow MatrixRow;
+   typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
 
 
    enum SPMVCudaKernel { scalar, vector, hybrid };
@@ -125,7 +126,7 @@ class CSR : public Sparse< Real, Device, Index >
    MatrixRow getRow( const IndexType rowIndex );
 
    __cuda_callable__
-   const MatrixRow getRow( const IndexType rowIndex ) const;
+   ConstMatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
    __cuda_callable__
@@ -198,6 +199,38 @@ class CSR : public Sparse< Real, Device, Index >
                            int gridIdx ) const;
 #endif
 
+   // The following getters allow us to interface TNL with external C-like
+   // libraries such as UMFPACK or SuperLU, which need the raw data.
+   Index* getRowPointers()
+   {
+       return this->rowPointers.getData();
+   }
+
+   const Index* getRowPointers() const
+   {
+       return this->rowPointers.getData();
+   }
+
+   Index* getColumnIndexes()
+   {
+       return this->columnIndexes.getData();
+   }
+
+   const Index* getColumnIndexes() const
+   {
+       return this->columnIndexes.getData();
+   }
+
+   Real* getValues()
+   {
+       return this->values.getData();
+   }
+
+   const Real* getValues() const
+   {
+       return this->values.getData();
+   }
+
    protected:
 
    Containers::Vector< Index, Device, Index > rowPointers;
@@ -209,11 +242,6 @@ class CSR : public Sparse< Real, Device, Index >
    typedef CSRDeviceDependentCode< DeviceType > DeviceDependentCode;
    friend class CSRDeviceDependentCode< DeviceType >;
    friend class tnlCusparseCSR< RealType >;
-#ifdef HAVE_UMFPACK
-    template< typename Matrix, typename Preconditioner >
-    friend class UmfpackWrapper;
-#endif
-
 };
 
 } // namespace Matrices
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 6d52662577bf6ad976b066e1b524f0665de9fd96..e021dc52c466788bbc39b6552e71420fb128be6d 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -82,8 +82,8 @@ bool CSR< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsL
     * necessary length of the vectors this->values
     * and this->columnIndexes.
     */
-   Assert( this->getRows() > 0, );
-   Assert( this->getColumns() > 0, );
+   TNL_ASSERT( this->getRows() > 0, );
+   TNL_ASSERT( this->getColumns() > 0, );
    Containers::SharedVector< IndexType, DeviceType, IndexType > rowPtrs;
    rowPtrs.bind( this->rowPointers.getData(), this->getRows() );
    rowPtrs = rowLengths;
@@ -163,7 +163,7 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row,
                                                           const RealType& value,
                                                           const RealType& thisElementMultiplicator )
 {
-   /*Assert( row >= 0 && row < this->rows &&
+   /*TNL_ASSERT( row >= 0 && row < this->rows &&
               column >= 0 && column <= this->rows,
               std::cerr << " row = " << row
                    << " column = " << column
@@ -213,7 +213,7 @@ bool CSR< Real, Device, Index >::addElement( const IndexType row,
                                                       const RealType& value,
                                                       const RealType& thisElementMultiplicator )
 {
-   Assert( row >= 0 && row < this->rows &&
+   TNL_ASSERT( row >= 0 && row < this->rows &&
                column >= 0 && column < this->columns,
                std::cerr << " row = " << row
                     << " column = " << column
@@ -406,16 +406,16 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-const typename CSR< Real, Device, Index >::MatrixRow
+typename CSR< Real, Device, Index >::ConstMatrixRow
 CSR< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
 {
    const IndexType rowOffset = this->rowPointers[ rowIndex ];
    const IndexType rowLength = this->rowPointers[ rowIndex + 1 ] - rowOffset;
-   return MatrixRow( &this->columnIndexes[ rowOffset ],
-                     &this->values[ rowOffset ],
-                     rowLength,
-                     1 );
+   return ConstMatrixRow( &this->columnIndexes[ rowOffset ],
+                          &this->values[ rowOffset ],
+                          rowLength,
+                          1 );
 }
 
 template< typename Real,
@@ -455,7 +455,7 @@ void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >&
                                             const RealType& matrixMultiplicator,
                                             const RealType& thisMatrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -467,7 +467,7 @@ template< typename Real,
 void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Index2 >& matrix,
                                                                       const RealType& matrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -480,7 +480,7 @@ bool CSR< Real, Device, Index >::performSORIteration( const Vector& b,
                                                       Vector& x,
                                                       const RealType& omega ) const
 {
-   Assert( row >=0 && row < this->getRows(),
+   TNL_ASSERT( row >=0 && row < this->getRows(),
               std::cerr << "row = " << row
                    << " this->getRows() = " << this->getRows() << std::endl );
 
@@ -629,7 +629,7 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
                                                               const IndexType warpEnd,
                                                               const IndexType inWarpIdx ) const
 {
-   volatile Real* aux = Devices::getSharedMemory< Real >();
+   volatile Real* aux = Devices::Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
@@ -723,7 +723,7 @@ class CSRDeviceDependentCode< Devices::Host >
          const InVector* inVectorPtr = &inVector;
          OutVector* outVectorPtr = &outVector;
 #ifdef HAVE_OPENMP
-#pragma omp parallel for firstprivate( matrixPtr, inVectorPtr, outVectorPtr ), schedule(static ), if( Devices::Host::isOMPEnabled() )
+#pragma omp parallel for firstprivate( matrixPtr, inVectorPtr, outVectorPtr ), schedule(dynamic,100), if( Devices::Host::isOMPEnabled() )
 #endif
          for( Index row = 0; row < rows; row ++ )
             ( *outVectorPtr )[ row ] = matrixPtr->rowVectorProduct( row, *inVectorPtr );
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 325ae861ab985f517879a3aeb4ea4076cedcb003..eb4bfd9f35d86f1800f33917ea6c6baaf14c4f1a 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -65,7 +65,7 @@ template< typename Real,
 bool ChunkedEllpack< Real, Device, Index >::setDimensions( const IndexType rows,
                                                                     const IndexType columns )
 {
-   Assert( rows > 0 && columns > 0,
+   TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
    if( ! Sparse< Real, Device, Index >::setDimensions( rows, columns ) )
@@ -110,7 +110,7 @@ void ChunkedEllpack< Real, Device, Index >::resolveSliceSizes( const Containers:
       row++;
       if( allocatedElementsInSlice < desiredElementsInSlice  )
           if( row < this->rows && sliceSize < chunksInSlice ) continue;
-      Assert( sliceSize >0, );
+      TNL_ASSERT( sliceSize >0, );
       this->slices[ numberOfSlices ].size = sliceSize;
       this->slices[ numberOfSlices ].firstRow = row - sliceSize;
       this->slices[ numberOfSlices ].pointer = allocatedElementsInSlice; // this is only temporary
@@ -173,7 +173,7 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( const CompressedRowsLength
       maxChunkInSlice = max( maxChunkInSlice,
                           ceil( ( RealType ) rowLengths[ i ] /
                                 ( RealType ) this->rowToChunkMapping[ i ] ) );
-   Assert( maxChunkInSlice > 0,
+   TNL_ASSERT( maxChunkInSlice > 0,
               std::cerr << " maxChunkInSlice = " << maxChunkInSlice << std::endl );
 
    /****
@@ -189,9 +189,9 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( const CompressedRowsLength
    for( IndexType i = sliceBegin; i < sliceEnd; i++ )
    {
       this->rowPointers[ i + 1 ] = maxChunkInSlice*rowToChunkMapping[ i ];
-      Assert( this->rowPointers[ i ] >= 0,
+      TNL_ASSERT( this->rowPointers[ i ] >= 0,
                  std::cerr << "this->rowPointers[ i ] = " << this->rowPointers[ i ] );
-      Assert( this->rowPointers[ i + 1 ] >= 0,
+      TNL_ASSERT( this->rowPointers[ i + 1 ] >= 0,
                  std::cerr << "this->rowPointers[ i + 1 ] = " << this->rowPointers[ i + 1 ] );
    }
 
@@ -208,8 +208,8 @@ template< typename Real,
           typename Index >
 bool ChunkedEllpack< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
-   Assert( this->getRows() > 0, );
-   Assert( this->getColumns() > 0, );
+   TNL_ASSERT( this->getRows() > 0, );
+   TNL_ASSERT( this->getColumns() > 0, );
 
    IndexType elementsToAllocation( 0 );
 
@@ -254,7 +254,7 @@ template< typename Real,
 Index ChunkedEllpack< Real, Device, Index >::getRowLength( const IndexType row ) const
 {
    const IndexType& sliceIndex = rowToSliceMapping[ row ];
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    const IndexType& chunkSize = slices.getElement( sliceIndex ).chunkSize;
    return rowPointers[ row + 1 ] - rowPointers[ row ];
 }
@@ -338,7 +338,7 @@ template< typename Real,
              typename Index2 >
 bool ChunkedEllpack< Real, Device, Index >::operator == ( const ChunkedEllpack< Real2, Device2, Index2 >& matrix ) const
 {
-   Assert( this->getRows() == matrix.getRows() &&
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
               this->getColumns() == matrix.getColumns(),
               std::cerr << "this->getRows() = " << this->getRows()
                    << " matrix.getRows() = " << matrix.getRows()
@@ -390,7 +390,7 @@ bool ChunkedEllpack< Real, Device, Index >::addElementFast( const IndexType row,
                                                                      const RealType& _thisElementMultiplicator )
 {
    // TODO: return this back when CUDA kernels support std::cerr
-   /*Assert( row >= 0 && row < this->rows &&
+   /*TNL_ASSERT( row >= 0 && row < this->rows &&
               _column >= 0 && _column <= this->columns,
               std::cerr << " row = " << row
                    << " column = " << _column
@@ -398,7 +398,7 @@ bool ChunkedEllpack< Real, Device, Index >::addElementFast( const IndexType row,
                    << " this->columns = " << this-> columns );*/
 
    const IndexType& sliceIndex = rowToSliceMapping[ row ];
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices[ sliceIndex ].firstRow )
       chunkIndex = rowToChunkMapping[ row - 1 ];
@@ -489,7 +489,7 @@ bool ChunkedEllpack< Real, Device, Index >::addElement( const IndexType row,
                                                                  const RealType& _value,
                                                                  const RealType& _thisElementMultiplicator )
 {
-   Assert( row >= 0 && row < this->rows &&
+   TNL_ASSERT( row >= 0 && row < this->rows &&
               _column >= 0 && _column <= this->columns,
               std::cerr << " row = " << row
                    << " column = " << _column
@@ -497,7 +497,7 @@ bool ChunkedEllpack< Real, Device, Index >::addElement( const IndexType row,
                    << " this->columns = " << this-> columns );
 
    const IndexType& sliceIndex = rowToSliceMapping.getElement( row );
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices.getElement( sliceIndex ).firstRow )
       chunkIndex = rowToChunkMapping.getElement( row - 1 );
@@ -586,11 +586,11 @@ bool ChunkedEllpack< Real, Device, Index >::setRowFast( const IndexType row,
                                                                  const IndexType elements )
 {
    // TODO: return this back when CUDA kernels support std::cerr
-   /*Assert( row >= 0 && row < this->rows,
+   /*TNL_ASSERT( row >= 0 && row < this->rows,
               std::cerr << " row = " << row
                    << " this->rows = " << this->rows );*/
    const IndexType sliceIndex = rowToSliceMapping[ row ];
-   //Assert( sliceIndex < this->rows, );
+   //TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices[ sliceIndex ].firstRow )
       chunkIndex = rowToChunkMapping[ row - 1 ];
@@ -661,12 +661,12 @@ bool ChunkedEllpack< Real, Device, Index >::setRow( const IndexType row,
                                                              const RealType* values,
                                                              const IndexType elements )
 {
-   Assert( row >= 0 && row < this->rows,
+   TNL_ASSERT( row >= 0 && row < this->rows,
               std::cerr << " row = " << row
                    << " this->rows = " << this->rows );
 
    const IndexType sliceIndex = rowToSliceMapping.getElement( row );
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices.getElement( sliceIndex ).firstRow )
       chunkIndex = rowToChunkMapping.getElement( row - 1 );
@@ -762,7 +762,7 @@ Real ChunkedEllpack< Real, Device, Index >::getElementFast( const IndexType row,
                                                                      const IndexType column ) const
 {
    const IndexType sliceIndex = rowToSliceMapping[ row ];
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices[ sliceIndex ].firstRow )
       chunkIndex = rowToChunkMapping[ row - 1 ];
@@ -808,7 +808,7 @@ Real ChunkedEllpack< Real, Device, Index >::getElement( const IndexType row,
                                                                  const IndexType column ) const
 {
    const IndexType& sliceIndex = rowToSliceMapping.getElement( row );
-   Assert( sliceIndex < this->rows,
+   TNL_ASSERT( sliceIndex < this->rows,
               std::cerr << " sliceIndex = " << sliceIndex
                    << " this->rows = " << this->rows << std::endl; );
    IndexType chunkIndex( 0 );
@@ -863,7 +863,7 @@ void ChunkedEllpack< Real, Device, Index >::getRowFast( const IndexType row,
                                                                  RealType* values ) const
 {
    const IndexType& sliceIndex = rowToSliceMapping[ row ];
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices[ sliceIndex ].firstRow )
       chunkIndex = rowToChunkMapping[ row - 1 ];
@@ -947,7 +947,7 @@ void ChunkedEllpack< Real, Device, Index >::getRow( const IndexType row,
                                                              RealType* values ) const
 {
    const IndexType& sliceIndex = rowToSliceMapping.getElement( row );
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices.getElement( sliceIndex ).firstRow )
       chunkIndex = rowToChunkMapping.getElement( row - 1 );
@@ -1003,11 +1003,11 @@ __cuda_callable__
 typename Vector::RealType ChunkedEllpack< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                             const Vector& vector ) const
 {
-   /*Assert( row >=0 && row < this->rows,
+   /*TNL_ASSERT( row >=0 && row < this->rows,
             std::cerr << " row = " << row << " this->rows = " << this->rows );*/
 
    const IndexType sliceIndex = rowToSliceMapping[ row ];
-   //Assert( sliceIndex < this->rows, );
+   //TNL_ASSERT( sliceIndex < this->rows, );
    IndexType chunkIndex( 0 );
    if( row != slices[ sliceIndex ].firstRow )
       chunkIndex = rowToChunkMapping[ row - 1 ];
@@ -1068,7 +1068,7 @@ __device__ void ChunkedEllpack< Real, Device, Index >::computeSliceVectorProduct
 {
    static_assert( std::is_same < DeviceType, Devices::Cuda >::value, "" );
 
-   RealType* chunkProducts = Devices::getSharedMemory< RealType >();
+   RealType* chunkProducts = Devices::Cuda::getSharedMemory< RealType >();
    ChunkedEllpackSliceInfo* sliceInfo = ( ChunkedEllpackSliceInfo* ) & chunkProducts[ blockDim.x ];
 
    if( threadIdx.x == 0 )
@@ -1115,7 +1115,7 @@ void ChunkedEllpack< Real, Device, Index >::addMatrix( const ChunkedEllpack< Rea
                                                                           const RealType& matrixMultiplicator,
                                                                           const RealType& thisMatrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -1127,7 +1127,7 @@ template< typename Real,
 void ChunkedEllpack< Real, Device, Index >::getTransposition( const ChunkedEllpack< Real2, Device, Index2 >& matrix,
                                                                        const RealType& matrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -1140,7 +1140,7 @@ bool ChunkedEllpack< Real, Device, Index >::performSORIteration( const Vector& b
                                                                                     Vector& x,
                                                                                     const RealType& omega ) const
 {
-   Assert( row >=0 && row < this->getRows(),
+   TNL_ASSERT( row >=0 && row < this->getRows(),
               std::cerr << "row = " << row
                    << " this->getRows() = " << this->getRows() << std::endl );
 
@@ -1148,7 +1148,7 @@ bool ChunkedEllpack< Real, Device, Index >::performSORIteration( const Vector& b
    RealType sum( 0.0 );
 
    const IndexType& sliceIndex = rowToSliceMapping[ row ];
-   Assert( sliceIndex < this->rows, );
+   TNL_ASSERT( sliceIndex < this->rows, );
    const IndexType& chunkSize = slices.getElement( sliceIndex ).chunkSize;
    IndexType elementPtr = rowPointers[ row ];
    const IndexType rowEnd = rowPointers[ row + 1 ];
@@ -1225,7 +1225,7 @@ void ChunkedEllpack< Real, Device, Index >::print( std::ostream& str ) const
       str <<"Row: " << row << " -> ";
 
       const IndexType& sliceIndex = rowToSliceMapping.getElement( row );
-      //Assert( sliceIndex < this->rows, );
+      //TNL_ASSERT( sliceIndex < this->rows, );
       const IndexType& chunkSize = slices.getElement( sliceIndex ).chunkSize;
       IndexType elementPtr = rowPointers.getElement( row );
       const IndexType rowEnd = rowPointers.getElement( row + 1 );
diff --git a/src/TNL/Matrices/DenseRow_impl.h b/src/TNL/Matrices/DenseRow_impl.h
index f49b3ff58804c575132224636cd7d9e5bd63b25e..7b1bac1a5bdc5074b5f22b4d3b5d86046e605011 100644
--- a/src/TNL/Matrices/DenseRow_impl.h
+++ b/src/TNL/Matrices/DenseRow_impl.h
@@ -56,9 +56,9 @@ setElement( const Index& elementIndex,
             const Index& column,
             const Real& value )
 {
-   Assert( this->values, );
-   Assert( this->step > 0,);
-   Assert( column >= 0 && column < this->columns,
+   TNL_ASSERT( this->values, );
+   TNL_ASSERT( this->step > 0,);
+   TNL_ASSERT( column >= 0 && column < this->columns,
               std::cerr << "column = " << column << " this->columns = " << this->columns );
 
    this->values[ column * this->step ] = value;
diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h
index 5f44c82bfbf1f9a1d0b12795b9386f7f0c812ef0..783e3b6035a8120c2525da5a9832dd86ecef28ae 100644
--- a/src/TNL/Matrices/Dense_impl.h
+++ b/src/TNL/Matrices/Dense_impl.h
@@ -141,7 +141,7 @@ bool Dense< Real, Device, Index >::setElementFast( const IndexType row,
                                                             const IndexType column,
                                                             const RealType& value )
 {
-   Assert( row >= 0 && row < this->getRows() &&
+   TNL_ASSERT( row >= 0 && row < this->getRows() &&
               column >= 0 && column < this->getColumns(),
               std::cerr << " row = " << row << " column = " << column << " this->getRows() = " << this->getRows()
                    << " this->getColumns() = " << this->getColumns() );
@@ -170,7 +170,7 @@ bool Dense< Real, Device, Index >::addElementFast( const IndexType row,
                                                             const RealType& value,
                                                             const RealType& thisElementMultiplicator )
 {
-   Assert( row >= 0 && row < this->getRows() &&
+   TNL_ASSERT( row >= 0 && row < this->getRows() &&
               column >= 0 && column < this->getColumns(),
               printf( " row = %d, column = %d, this->getRows = %d, this->getColumns() = %d \n", row, column, this->getRows(), this->getColumns() ) );
    const IndexType elementIndex = this->getElementIndex( row, column );
@@ -210,7 +210,7 @@ bool Dense< Real, Device, Index >::setRowFast( const IndexType row,
                                                         const RealType* values,
                                                         const IndexType elements )
 {
-   Assert( elements <= this->getColumns(),
+   TNL_ASSERT( elements <= this->getColumns(),
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->getColumns() );
    for( IndexType i = 0; i < elements; i++ )
@@ -226,7 +226,7 @@ bool Dense< Real, Device, Index >::setRow( const IndexType row,
                                                     const RealType* values,
                                                     const IndexType elements )
 {
-   Assert( elements <= this->getColumns(),
+   TNL_ASSERT( elements <= this->getColumns(),
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->getColumns() );
    for( IndexType i = 0; i < elements; i++ )
@@ -244,7 +244,7 @@ bool Dense< Real, Device, Index >::addRowFast( const IndexType row,
                                                         const IndexType elements,
                                                         const RealType& thisRowMultiplicator )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    for( IndexType i = 0; i < elements; i++ )
@@ -262,7 +262,7 @@ bool Dense< Real, Device, Index >::addRow( const IndexType row,
                                                     const IndexType elements,
                                                     const RealType& thisRowMultiplicator )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    for( IndexType i = 0; i < elements; i++ )
@@ -279,7 +279,7 @@ __cuda_callable__
 const Real& Dense< Real, Device, Index >::getElementFast( const IndexType row,
                                                             const IndexType column ) const
 {
-   Assert( row >= 0 && row < this->getRows() &&
+   TNL_ASSERT( row >= 0 && row < this->getRows() &&
               column >= 0 && column < this->getColumns(),
               printf( " row = %d, column = %d, this->getRows = %d, this->getColumns() = %d \n", row, column, this->getRows(), this->getColumns() ) );
    return this->values.operator[]( this->getElementIndex( row, column ) );
@@ -367,10 +367,10 @@ template< typename Real,
 void Dense< Real, Device, Index >::vectorProduct( const InVector& inVector,
                                                            OutVector& outVector ) const
 {
-   Assert( this->getColumns() == inVector.getSize(),
+   TNL_ASSERT( this->getColumns() == inVector.getSize(),
             std::cerr << "Matrix columns: " << this->getColumns() << std::endl
                  << "Vector size: " << inVector.getSize() << std::endl );
-   Assert( this->getRows() == outVector.getSize(),
+   TNL_ASSERT( this->getRows() == outVector.getSize(),
                std::cerr << "Matrix rows: " << this->getRows() << std::endl
                     << "Vector size: " << outVector.getSize() << std::endl );
 
@@ -385,7 +385,7 @@ void Dense< Real, Device, Index >::addMatrix( const Matrix& matrix,
                                                        const RealType& matrixMultiplicator,
                                                        const RealType& thisMatrixMultiplicator )
 {
-   Assert( this->getColumns() == matrix.getColumns() &&
+   TNL_ASSERT( this->getColumns() == matrix.getColumns() &&
               this->getRows() == matrix.getRows(),
             std::cerr << "This matrix columns: " << this->getColumns() << std::endl
                  << "This matrix rows: " << this->getRows() << std::endl
@@ -506,7 +506,7 @@ void Dense< Real, Device, Index >::getMatrixProduct( const Matrix1& matrix1,
                                                               const RealType& matrix1Multiplicator,
                                                               const RealType& matrix2Multiplicator )
 {
-   Assert( matrix1.getColumns() == matrix2.getRows() &&
+   TNL_ASSERT( matrix1.getColumns() == matrix2.getRows() &&
               this->getRows() == matrix1.getRows() &&
               this->getColumns() == matrix2.getColumns(),
             std::cerr << "This matrix columns: " << this->getColumns() << std::endl
@@ -742,7 +742,7 @@ template< typename Real,
 void Dense< Real, Device, Index >::getTransposition( const Matrix& matrix,
                                                               const RealType& matrixMultiplicator )
 {
-   Assert( this->getColumns() == matrix.getRows() &&
+   TNL_ASSERT( this->getColumns() == matrix.getRows() &&
               this->getRows() == matrix.getColumns(),
                std::cerr << "This matrix columns: " << this->getColumns() << std::endl
                     << "This matrix rows: " << this->getRows() << std::endl
@@ -903,7 +903,7 @@ __cuda_callable__
 Index Dense< Real, Device, Index >::getElementIndex( const IndexType row,
                                                               const IndexType column ) const
 {
-   Assert( ( std::is_same< Device, Devices::Host >::value ||
+   TNL_ASSERT( ( std::is_same< Device, Devices::Host >::value ||
           std::is_same< Device, Devices::Cuda >::value ), )
    if( std::is_same< Device, Devices::Host >::value )
       return row * this->columns + column;
diff --git a/src/TNL/Matrices/Ellpack.h b/src/TNL/Matrices/Ellpack.h
index f7d0b9cca03beb9c60e5d5c860e76789d1f97c07..64bb18123d1011cb0678c62bcf57936d98617aed 100644
--- a/src/TNL/Matrices/Ellpack.h
+++ b/src/TNL/Matrices/Ellpack.h
@@ -35,6 +35,7 @@ class Ellpack : public Sparse< Real, Device, Index >
    typedef Ellpack< Real, Devices::Cuda, Index > CudaType;
    typedef Sparse< Real, Device, Index > BaseType;
    typedef typename BaseType::MatrixRow MatrixRow;
+   typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
 
    Ellpack();
 
@@ -128,7 +129,7 @@ class Ellpack : public Sparse< Real, Device, Index >
    MatrixRow getRow( const IndexType rowIndex );
 
    __cuda_callable__
-   const MatrixRow getRow( const IndexType rowIndex ) const;
+   ConstMatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
    __cuda_callable__
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index beab1b135d585893f8f4bbbe3b7a0ef383e8faec..c183b4777368e49a30b9b8ba5e6efa97e52c2591 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -53,7 +53,7 @@ template< typename Real,
 bool Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
                                                              const IndexType columns )
 {
-   Assert( rows > 0 && columns > 0,
+   TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
    this->rows = rows;
@@ -71,9 +71,9 @@ template< typename Real,
           typename Index >
 bool Ellpack< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
-   Assert( this->getRows() > 0, );
-   Assert( this->getColumns() > 0, );
-   Assert( rowLengths.getSize() > 0, );
+   TNL_ASSERT( this->getRows() > 0, );
+   TNL_ASSERT( this->getColumns() > 0, );
+   TNL_ASSERT( rowLengths.getSize() > 0, );
    this->rowLengths = this->maxRowLength = rowLengths.max();
    return allocateElements();
 }
@@ -83,7 +83,7 @@ template< typename Real,
           typename Index >
 bool Ellpack< Real, Device, Index >::setConstantCompressedRowsLengths( const IndexType& rowLengths )
 {
-   Assert( rowLengths > 0,
+   TNL_ASSERT( rowLengths > 0,
               std::cerr << " rowLengths = " << rowLengths );
    this->rowLengths = rowLengths;
    if( this->rows > 0 )
@@ -132,7 +132,7 @@ template< typename Real,
              typename Index2 >
 bool Ellpack< Real, Device, Index >::operator == ( const Ellpack< Real2, Device2, Index2 >& matrix ) const
 {
-   Assert( this->getRows() == matrix.getRows() &&
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
               this->getColumns() == matrix.getColumns(),
               std::cerr << "this->getRows() = " << this->getRows()
                    << " matrix.getRows() = " << matrix.getRows()
@@ -195,7 +195,7 @@ bool Ellpack< Real, Device, Index > :: addElementFast( const IndexType row,
                                                                 const RealType& thisElementMultiplicator )
 {
    // TODO: return this back when CUDA kernels support std::cerr
-   /*Assert( row >= 0 && row < this->rows &&
+   /*TNL_ASSERT( row >= 0 && row < this->rows &&
               column >= 0 && column <= this->rows,
               std::cerr << " row = " << row
                    << " column = " << column
@@ -455,16 +455,16 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__
-const typename Ellpack< Real, Device, Index >::MatrixRow
+typename Ellpack< Real, Device, Index >::ConstMatrixRow
 Ellpack< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
 {
    //printf( "this->rowLengths = %d this = %p \n", this->rowLengths, this );
    IndexType rowBegin = DeviceDependentCode::getRowBegin( *this, rowIndex );
-   return MatrixRow( &this->columnIndexes[ rowBegin ],
-                     &this->values[ rowBegin ],
-                     this->rowLengths,
-                     DeviceDependentCode::getElementStep( *this ) );
+   return ConstMatrixRow( &this->columnIndexes[ rowBegin ],
+                          &this->values[ rowBegin ],
+                          this->rowLengths,
+                          DeviceDependentCode::getElementStep( *this ) );
 }
 
 template< typename Real,
@@ -509,7 +509,7 @@ void Ellpack< Real, Device, Index > :: addMatrix( const Ellpack< Real2, Device,
                                                                  const RealType& matrixMultiplicator,
                                                                  const RealType& thisMatrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -521,7 +521,7 @@ template< typename Real,
 void Ellpack< Real, Device, Index >::getTransposition( const Ellpack< Real2, Device, Index2 >& matrix,
                                                                       const RealType& matrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -534,7 +534,7 @@ bool Ellpack< Real, Device, Index > :: performSORIteration( const Vector& b,
                                                                            Vector& x,
                                                                            const RealType& omega ) const
 {
-   Assert( row >=0 && row < this->getRows(),
+   TNL_ASSERT( row >=0 && row < this->getRows(),
               std::cerr << "row = " << row
                    << " this->getRows() = " << this->getRows() << std::endl );
 
diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h
index 4cecc842596b3cbb0e7913c174d6e6794b159fbd..31e72fb24288a2fa799a536fef52b3a3079090a6 100644
--- a/src/TNL/Matrices/MatrixOperations.h
+++ b/src/TNL/Matrices/MatrixOperations.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          MatrixOperations.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 /*
@@ -37,7 +49,7 @@ public:
          const RealType& beta,
          RealType* y )
    {
-      Assert( m <= lda, );
+      TNL_ASSERT( m <= lda, );
 
       if( beta != 0.0 ) {
 #ifdef HAVE_OPENMP
@@ -90,16 +102,7 @@ GemvCudaKernel( const IndexType m,
    IndexType elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const IndexType gridSize = blockDim.x * gridDim.x;
 
-   // NOTE: Plain declaration such as
-   //          extern __shared__ RealType shx[];
-   //       won't work because extern variables must be declared exactly once.
-   //       In templated functions we need to have same variable name with
-   //       different type, which causes the conflict. In CUDA samples they
-   //       solve it using template specialization via classes, but using char
-   //       as the base type and reinterpret_cast works too.
-   //       See http://stackoverflow.com/a/19339004/4180822
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   RealType* shx = reinterpret_cast< RealType* >( __sdata );
+   RealType* shx = Devices::Cuda::getSharedMemory< RealType >();
 
    if( threadIdx.x < n )
       shx[ threadIdx.x ] = x[ threadIdx.x ];
@@ -154,14 +157,14 @@ public:
          const RealType& beta,
          RealType* y )
    {
-      Assert( m <= lda, );
-      Assert( n <= 256,
+      TNL_ASSERT( m <= lda, );
+      TNL_ASSERT( n <= 256,
               std::cerr << "The gemv kernel is optimized only for small 'n' and assumes that n <= 256." << std::endl; );
 
 #ifdef HAVE_CUDA
       Containers::Vector< RealType, Devices::Cuda, IndexType > xDevice;
       xDevice.setSize( n );
-      if( ! Containers::ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< RealType, RealType, IndexType >( xDevice.getData(), x, n ) )
+      if( ! Containers::Algorithms::ArrayOperations< Devices::Cuda, Devices::Host >::copyMemory< RealType, RealType, IndexType >( xDevice.getData(), x, n ) )
          throw 1;
 
       dim3 blockSize( 256 );
diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h
index f7ada94abcdd9d7d7867604f38731f32b59cb7bb..87ddf93d3b7bfa665d5f0e441c6c74a99294857d 100644
--- a/src/TNL/Matrices/MatrixReader_impl.h
+++ b/src/TNL/Matrices/MatrixReader_impl.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <iomanip>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/String.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Timer.h>
@@ -162,7 +162,7 @@ template< typename Matrix >
 bool MatrixReader< Matrix >::checkMtxHeader( const String& header,
                                                 bool& symmetric )
 {
-   List< String > parsedLine;
+   Containers::List< String > parsedLine;
    header.parse( parsedLine );
    if( parsedLine.getSize() < 5 )
       return false;
@@ -208,7 +208,7 @@ bool MatrixReader< Matrix >::readMtxHeader( std::istream& file,
    file.seekg( 0, std::ios::beg );
    String line;
    bool headerParsed( false );
-   List< String > parsedLine;
+   Containers::List< String > parsedLine;
    while( true )
    {
       line.getLine( file );
@@ -364,7 +364,7 @@ bool MatrixReader< Matrix >::parseMtxLineWithElement( const String& line,
                                                          IndexType& column,
                                                          RealType& value )
 {
-   List< String > parsedLine;
+   Containers::List< String > parsedLine;
    line.parse( parsedLine );
    if( parsedLine.getSize() != 3 )
    {
diff --git a/src/TNL/Matrices/MatrixWriter.h b/src/TNL/Matrices/MatrixWriter.h
index 5d2be313a4b900272acc3382b54fd35f2ad24572..634a3437b9e6da626a1e1ae47930126c21e2cf0a 100644
--- a/src/TNL/Matrices/MatrixWriter.h
+++ b/src/TNL/Matrices/MatrixWriter.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <ostream>
+#include <iostream>
 
 namespace TNL {
 namespace Matrices {   
@@ -23,26 +24,27 @@ class MatrixWriter
    typedef typename Matrix::IndexType IndexType;
    typedef typename Matrix::RealType RealType;
 
-   static bool writeToGnuplot( std::ostream str,
+   static bool writeToGnuplot( std::ostream& str,
                                const Matrix& matrix,
                                bool verbose = false );
 
-   static bool writeToEps( std::ostream str,
+   static bool writeToEps( std::ostream& str,
                            const Matrix& matrix,
                            bool verbose = false );
 
    protected:
 
-   static bool writeEpsHeader( std::ostream str,
+   static bool writeEpsHeader( std::ostream& str,
                                const Matrix& matrix,
                                const int elementSize );
 
-   static bool writeEpsBody( std::ostream str,
+   static bool writeEpsBody( std::ostream& str,
                              const Matrix& matrix,
-                             const int elementSize );
-
-
+                             const int elementSize,
+                             bool verbose );
 };
 
 } // namespace Matrices
 } // namespace TNL
+
+#include <TNL/Matrices/MatrixWriter_impl.h>
diff --git a/src/TNL/Matrices/MatrixWriter_impl.h b/src/TNL/Matrices/MatrixWriter_impl.h
index 9f36993251f073ad99ecb1e8dc2166cd811ea276..1db1a9df9e63435e9052aa65f4d35527038278bc 100644
--- a/src/TNL/Matrices/MatrixWriter_impl.h
+++ b/src/TNL/Matrices/MatrixWriter_impl.h
@@ -10,21 +10,23 @@
 
 #pragma once
 
+#include <TNL/Matrices/MatrixWriter.h>
+
 namespace TNL {
 namespace Matrices {   
 
 template< typename Matrix >
-bool MatrixWriter< Matrix >::writeToGnuplot( std::ostream str,
-                                                const Matrix& matrix,
-                                                bool verbose )
+bool MatrixWriter< Matrix >::writeToGnuplot( std::ostream& str,
+                                             const Matrix& matrix,
+                                             bool verbose )
 {
    for( IndexType row = 0; row < matrix.getRows(); row ++ )
    {
       for( IndexType column = 0; column < matrix.getColumns(); column ++ )
       {
-         RealType elementValue = maytrix.getElement( row, column );
+         RealType elementValue = matrix.getElement( row, column );
          if(  elementValue != ( RealType ) 0.0 )
-            str << column << " " << getSize() - row << " " << elementValue << std::endl;
+            str << column << " " << row << " " << elementValue << std::endl;
       }
       if( verbose )
         std::cout << "Drawing the row " << row << "      \r" << std::flush;
@@ -35,9 +37,9 @@ bool MatrixWriter< Matrix >::writeToGnuplot( std::ostream str,
 }
 
 template< typename Matrix >
-bool MatrixWriter< Matrix >::writeToEps( std::ostream str,
-                                            const Matrix& matrix,
-                                            bool verbose )
+bool MatrixWriter< Matrix >::writeToEps( std::ostream& str,
+                                         const Matrix& matrix,
+                                         bool verbose )
 {
    const int elementSize = 10;
    if( ! writeEpsHeader( str, matrix, elementSize ) )
@@ -54,9 +56,9 @@ bool MatrixWriter< Matrix >::writeToEps( std::ostream str,
 }
 
 template< typename Matrix >
-bool MatrixWriter< Matrix >::writeEpsHeader( std::ostream str,
-                                                const Marix& matrix,
-                                                const int elementSize )
+bool MatrixWriter< Matrix >::writeEpsHeader( std::ostream& str,
+                                             const Matrix& matrix,
+                                             const int elementSize )
 {
    const double scale = elementSize * max( matrix.getRows(), matrix.getColumns() );
    str << "%!PS-Adobe-2.0 EPSF-2.0" << std::endl;
@@ -69,14 +71,15 @@ bool MatrixWriter< Matrix >::writeEpsHeader( std::ostream str,
 }
 
 template< typename Matrix >
-bool MatrixWriter< Matrix >::writeEpsBody( std::ostream str,
-                                              const Marix& matrix,
-                                              const int elementSize )
+bool MatrixWriter< Matrix >::writeEpsBody( std::ostream& str,
+                                           const Matrix& matrix,
+                                           const int elementSize,
+                                           bool verbose )
 {
    IndexType lastRow( 0 ), lastColumn( 0 );
-   for( IndexType row = 0; row < getSize(); row ++ )
+   for( IndexType row = 0; row < matrix.getRows(); row ++ )
    {
-      for( IndexType column = 0; column < getSize(); column ++ )
+      for( IndexType column = 0; column < matrix.getColumns(); column ++ )
       {
          RealType elementValue = getElement( row, column );
          if( elementValue != ( RealType ) 0.0 )
diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h
index f4dc2a12d997e2eac4abb8d893b437d5e5cf08b0..19e40ae200eaabb1c27ca2e4054b3c23c6c726d6 100644
--- a/src/TNL/Matrices/Matrix_impl.h
+++ b/src/TNL/Matrices/Matrix_impl.h
@@ -31,7 +31,7 @@ template< typename Real,
  bool Matrix< Real, Device, Index >::setDimensions( const IndexType rows,
                                                        const IndexType columns )
 {
-   Assert( rows > 0 && columns > 0,
+   TNL_ASSERT( rows > 0 && columns > 0,
             std::cerr << " rows = " << rows << " columns = " << columns );
    this->rows = rows;
    this->columns = columns;
@@ -93,8 +93,8 @@ template< typename Real,
 bool Matrix< Real, Device, Index >::copyFrom( const MatrixT& matrix,
                                               const CompressedRowsLengthsVector& rowLengths )
 {
-   /*tnlStaticAssert( DeviceType::DeviceType == Devices::HostDevice, );
-   tnlStaticAssert( DeviceType::DeviceType == Matrix:DeviceType::DeviceType, );*/
+   /*tnlStaticAssert( DeviceType::DeviceType == Devices::HostDevice, );
+   tnlStaticAssert( DeviceType::DeviceType == Matrix::DeviceType::DeviceType, );*/
 
    this->setLike( matrix );
    if( ! this->setCompressedRowsLengths( rowLengths ) )
@@ -106,7 +106,7 @@ bool Matrix< Real, Device, Index >::copyFrom( const MatrixT& matrix,
       return false;
    for( IndexType row = 0; row < this->getRows(); row++ )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
       // TODO: fix this
       //matrix.getRow( row, columns.getData(), values.getData() );
       this->setRow( row, columns.getData(), values.getData(), rowLengths.getElement( row ) );
diff --git a/src/TNL/Matrices/MultidiagonalRow_impl.h b/src/TNL/Matrices/MultidiagonalRow_impl.h
index 6fe47d4decf68929e2c2abd48d7c1ba23ab32d56..2765188c1a18fc028e27b455ca3a9986c23409f1 100644
--- a/src/TNL/Matrices/MultidiagonalRow_impl.h
+++ b/src/TNL/Matrices/MultidiagonalRow_impl.h
@@ -71,16 +71,16 @@ setElement( const Index& elementIndex,
             const Index& column,
             const Real& value )
 {
-   Assert( this->values, );
-   Assert( this->step > 0,);
-   Assert( column >= 0 && column < this->columns,
+   TNL_ASSERT( this->values, );
+   TNL_ASSERT( this->step > 0,);
+   TNL_ASSERT( column >= 0 && column < this->columns,
               std::cerr << "column = " << columns << " this->columns = " << this->columns );
-   Assert( elementIndex >= 0 && elementIndex < this->maxRowLength,
+   TNL_ASSERT( elementIndex >= 0 && elementIndex < this->maxRowLength,
               std::cerr << "elementIndex = " << elementIndex << " this->maxRowLength =  " << this->maxRowLength );
 
    Index aux = elementIndex;
    while( row + this->diagonals[ aux ] < column ) aux++;
-   Assert( row + this->diagonals[ aux ] == column,
+   TNL_ASSERT( row + this->diagonals[ aux ] == column,
               std::cerr << "row = " << row
                    << " aux = " << aux
                    << " this->diagonals[ aux ] = " << this->diagonals[ aux]
diff --git a/src/TNL/Matrices/Multidiagonal_impl.h b/src/TNL/Matrices/Multidiagonal_impl.h
index 1e3463657ee9084699622fc5ce511a04f0d8441d..0e2aed933a19a0cf6e539e8179ac9544be4fa2ab 100644
--- a/src/TNL/Matrices/Multidiagonal_impl.h
+++ b/src/TNL/Matrices/Multidiagonal_impl.h
@@ -53,7 +53,7 @@ template< typename Real,
 bool Multidiagonal< Real, Device, Index >::setDimensions( const IndexType rows,
                                                                    const IndexType columns )
 {
-   Assert( rows > 0 && columns > 0,
+   TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
    if( ! Matrix< Real, Device, Index >::setDimensions( rows, columns ) )
@@ -109,7 +109,7 @@ template< typename Real,
    template< typename Vector >
 bool Multidiagonal< Real, Device, Index > :: setDiagonals(  const Vector& diagonals )
 {
-   Assert( diagonals.getSize() > 0,
+   TNL_ASSERT( diagonals.getSize() > 0,
               std::cerr << "New number of diagonals = " << diagonals.getSize() << std::endl );
    this->diagonalsShift.setLike( diagonals );
    this->diagonalsShift = diagonals;
@@ -183,7 +183,7 @@ template< typename Real,
              typename Index2 >
 bool Multidiagonal< Real, Device, Index >::operator == ( const Multidiagonal< Real2, Device2, Index2 >& matrix ) const
 {
-   Assert( this->getRows() == matrix.getRows() &&
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
               this->getColumns() == matrix.getColumns(),
               std::cerr << "this->getRows() = " << this->getRows()
                    << " matrix.getRows() = " << matrix.getRows()
@@ -503,10 +503,10 @@ template< typename Real,
 void Multidiagonal< Real, Device, Index >::vectorProduct( const InVector& inVector,
                                                                    OutVector& outVector ) const
 {
-   Assert( this->getColumns() == inVector.getSize(),
+   TNL_ASSERT( this->getColumns() == inVector.getSize(),
             std::cerr << "Matrix columns: " << this->getColumns() << std::endl
                  << "Vector size: " << inVector.getSize() << std::endl );
-   Assert( this->getRows() == outVector.getSize(),
+   TNL_ASSERT( this->getRows() == outVector.getSize(),
                std::cerr << "Matrix rows: " << this->getRows() << std::endl
                     << "Vector size: " << outVector.getSize() << std::endl );
 
@@ -522,7 +522,7 @@ void Multidiagonal< Real, Device, Index > :: addMatrix( const Multidiagonal< Rea
                                                                  const RealType& matrixMultiplicator,
                                                                  const RealType& thisMatrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
 }
 
 template< typename Real,
@@ -559,7 +559,7 @@ bool Multidiagonal< Real, Device, Index > :: performSORIteration( const Vector&
                                                                            Vector& x,
                                                                            const RealType& omega ) const
 {
-   Assert( row >=0 && row < this->getRows(),
+   TNL_ASSERT( row >=0 && row < this->getRows(),
               std::cerr << "row = " << row
                    << " this->getRows() = " << this->getRows() << std::endl );
 
@@ -654,10 +654,10 @@ bool Multidiagonal< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                      const IndexType column,
                                                                      Index& index ) const
 {
-   Assert( row >=0 && row < this->rows,
+   TNL_ASSERT( row >=0 && row < this->rows,
             std::cerr << "row = " << row
                  << " this->rows = " << this->rows << std::endl );
-   Assert( column >=0 && column < this->columns,
+   TNL_ASSERT( column >=0 && column < this->columns,
             std::cerr << "column = " << column
                  << " this->columns = " << this->columns << std::endl );
 
@@ -683,10 +683,10 @@ bool Multidiagonal< Real, Device, Index >::getElementIndexFast( const IndexType
                                                                          const IndexType column,
                                                                          Index& index ) const
 {
-   Assert( row >=0 && row < this->rows,
+   TNL_ASSERT( row >=0 && row < this->rows,
             std::cerr << "row = " << row
                  << " this->rows = " << this->rows << std::endl );
-   Assert( column >=0 && column < this->columns,
+   TNL_ASSERT( column >=0 && column < this->columns,
             std::cerr << "column = " << column
                  << " this->columns = " << this->columns << std::endl );
 
diff --git a/src/TNL/Matrices/SlicedEllpack.h b/src/TNL/Matrices/SlicedEllpack.h
index a496f544ebad6d77f8420425a821e777c9f25612..cb1395a089cd412ec63bb945366ed1ba8fb857c6 100644
--- a/src/TNL/Matrices/SlicedEllpack.h
+++ b/src/TNL/Matrices/SlicedEllpack.h
@@ -64,6 +64,7 @@ class SlicedEllpack : public Sparse< Real, Device, Index >
    typedef SlicedEllpack< Real, Devices::Cuda, Index > CudaType;
    typedef Sparse< Real, Device, Index > BaseType;
    typedef typename BaseType::MatrixRow MatrixRow;
+   typedef SparseRow< const RealType, const IndexType > ConstMatrixRow;
 
 
    SlicedEllpack();
@@ -150,7 +151,7 @@ class SlicedEllpack : public Sparse< Real, Device, Index >
    MatrixRow getRow( const IndexType rowIndex );
 
    __cuda_callable__
-   const MatrixRow getRow( const IndexType rowIndex ) const;
+   ConstMatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
    __cuda_callable__
diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h
index d15c38c12118240c42fd9f3aa0a743c78cbe9197..d80d9f47982068b39fd1305435bd3714637d3cdc 100644
--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -54,7 +54,7 @@ template< typename Real,
 bool SlicedEllpack< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows,
                                                                               const IndexType columns )
 {
-   Assert( rows > 0 && columns > 0,
+   TNL_ASSERT( rows > 0 && columns > 0,
               std::cerr << "rows = " << rows
                    << " columns = " << columns << std::endl );
    return Sparse< Real, Device, Index >::setDimensions( rows, columns );
@@ -66,8 +66,8 @@ template< typename Real,
           int SliceSize >
 bool SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
-   Assert( this->getRows() > 0, );
-   Assert( this->getColumns() > 0, );
+   TNL_ASSERT( this->getRows() > 0, );
+   TNL_ASSERT( this->getColumns() > 0, );
    const IndexType slices = roundUpDivision( this->rows, SliceSize );
    if( ! this->sliceCompressedRowsLengths.setSize( slices ) ||
        ! this->slicePointers.setSize( slices + 1 ) )
@@ -127,7 +127,7 @@ template< typename Real,
              typename Index2 >
 bool SlicedEllpack< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpack< Real2, Device2, Index2 >& matrix ) const
 {
-   Assert( this->getRows() == matrix.getRows() &&
+   TNL_ASSERT( this->getRows() == matrix.getRows() &&
               this->getColumns() == matrix.getColumns(),
               std::cerr << "this->getRows() = " << this->getRows()
                    << " matrix.getRows() = " << matrix.getRows()
@@ -183,7 +183,7 @@ bool SlicedEllpack< Real, Device, Index, SliceSize >::addElementFast( const Inde
                                                                                const RealType& value,
                                                                                const RealType& thisElementMultiplicator )
 {
-   Assert( row >= 0 && row < this->rows &&
+   TNL_ASSERT( row >= 0 && row < this->rows &&
               column >= 0 && column <= this->rows,
               std::cerr << " row = " << row
                    << " column = " << column
@@ -231,7 +231,7 @@ bool SlicedEllpack< Real, Device, Index, SliceSize >::addElement( const IndexTyp
                                                                            const RealType& value,
                                                                            const RealType& thisElementMultiplicator )
 {
-   Assert( row >= 0 && row < this->rows &&
+   TNL_ASSERT( row >= 0 && row < this->rows &&
               column >= 0 && column <= this->rows,
               std::cerr << " row = " << row
                    << " column = " << column
@@ -454,17 +454,17 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __cuda_callable__
-const typename SlicedEllpack< Real, Device, Index, SliceSize >::MatrixRow
+typename SlicedEllpack< Real, Device, Index, SliceSize >::ConstMatrixRow
 SlicedEllpack< Real, Device, Index, SliceSize >::
 getRow( const IndexType rowIndex ) const
 {
    Index rowBegin, rowEnd, step;
    DeviceDependentCode::initRowTraverseFast( *this, rowIndex, rowBegin, rowEnd, step );
    const IndexType slice = rowIndex / SliceSize;
-   return MatrixRow( &this->columnIndexes[ rowBegin ],
-                     &this->values[ rowBegin ],
-                     this->sliceCompressedRowsLengths[ slice ],
-                     step );
+   return ConstMatrixRow( &this->columnIndexes[ rowBegin ],
+                          &this->values[ rowBegin ],
+                          this->sliceCompressedRowsLengths[ slice ],
+                          step );
 }
 
 template< typename Real,
@@ -513,7 +513,7 @@ void SlicedEllpack< Real, Device, Index, SliceSize >::addMatrix( const SlicedEll
                                                                           const RealType& matrixMultiplicator,
                                                                           const RealType& thisMatrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -526,7 +526,7 @@ template< typename Real,
 void SlicedEllpack< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpack< Real2, Device, Index2 >& matrix,
                                                                       const RealType& matrixMultiplicator )
 {
-   Assert( false, std::cerr << "TODO: implement" );
+   TNL_ASSERT( false, std::cerr << "TODO: implement" );
    // TODO: implement
 }
 
@@ -540,7 +540,7 @@ bool SlicedEllpack< Real, Device, Index, SliceSize >::performSORIteration( const
                                                                                     Vector& x,
                                                                                     const RealType& omega ) const
 {
-   Assert( row >=0 && row < this->getRows(),
+   TNL_ASSERT( row >=0 && row < this->getRows(),
               std::cerr << "row = " << row
                    << " this->getRows() = " << this->getRows() << std::endl );
 
diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h
index 8c4027532a4fd810050f80c4ffcbcb4e9cea60b6..c3a20762f22d3e580935872c964e5e87e0ad5333 100644
--- a/src/TNL/Matrices/SparseRow.h
+++ b/src/TNL/Matrices/SparseRow.h
@@ -11,6 +11,10 @@
 
 #pragma once
 
+#include <type_traits>
+
+#include <TNL/Devices/Cuda.h>
+
 namespace TNL {
 namespace Matrices {   
 
@@ -39,6 +43,12 @@ class SparseRow
                        const Index& column,
                        const Real& value );
  
+      __cuda_callable__
+      const Index& getElementColumn( const Index& elementIndex ) const;
+ 
+      __cuda_callable__
+      const Real& getElementValue( const Index& elementIndex ) const;
+ 
       void print( std::ostream& str ) const;
 
    protected:
diff --git a/src/TNL/Matrices/SparseRow_impl.h b/src/TNL/Matrices/SparseRow_impl.h
index 414ba822309418e5fbc2c216b957b59dcb20e15a..c4b69044bcd27268b3e6df8b53ab71ab379c9349 100644
--- a/src/TNL/Matrices/SparseRow_impl.h
+++ b/src/TNL/Matrices/SparseRow_impl.h
@@ -10,6 +10,8 @@
 
 #pragma once
 
+#include <TNL/Matrices/SparseRow.h>
+
 namespace TNL {
 namespace Matrices {   
 
@@ -61,24 +63,49 @@ setElement( const Index& elementIndex,
             const Index& column,
             const Real& value )
 {
-   Assert( this->columns, );
-   Assert( this->values, );
-   Assert( this->step > 0,);
+   TNL_ASSERT( this->columns, );
+   TNL_ASSERT( this->values, );
+   TNL_ASSERT( this->step > 0,);
    //printf( "elementIndex = %d length = %d \n", elementIndex, this->length );
-   Assert( elementIndex >= 0 && elementIndex < this->length,
+   TNL_ASSERT( elementIndex >= 0 && elementIndex < this->length,
               std::cerr << "elementIndex = " << elementIndex << " this->length = " << this->length );
 
    this->columns[ elementIndex * step ] = column;
    this->values[ elementIndex * step ] = value;
 }
 
+template< typename Real, typename Index >
+__cuda_callable__
+const Index&
+SparseRow< Real, Index >::
+getElementColumn( const Index& elementIndex ) const
+{
+   TNL_ASSERT( elementIndex >= 0 && elementIndex < this->length,
+              std::cerr << "elementIndex = " << elementIndex << " this->length = " << this->length );
+
+   return this->columns[ elementIndex * step ];
+}
+
+template< typename Real, typename Index >
+__cuda_callable__
+const Real&
+SparseRow< Real, Index >::
+getElementValue( const Index& elementIndex ) const
+{
+   TNL_ASSERT( elementIndex >= 0 && elementIndex < this->length,
+              std::cerr << "elementIndex = " << elementIndex << " this->length = " << this->length );
+
+   return this->values[ elementIndex * step ];
+}
+
 template< typename Real, typename Index >
 void
 SparseRow< Real, Index >::
 print( std::ostream& str ) const
 {
-   Index pos( 0 );
-   for( Index i = 0; i < length; i++ )
+   using NonConstIndex = typename std::remove_const< Index >::type;
+   NonConstIndex pos( 0 );
+   for( NonConstIndex i = 0; i < length; i++ )
    {
       str << " [ " << columns[ pos ] << " ] = " << values[ pos ] << ", ";
       pos += step;
diff --git a/src/TNL/Matrices/TridiagonalRow_impl.h b/src/TNL/Matrices/TridiagonalRow_impl.h
index 09d173342f5578c867569c58b1688c5425ac572f..f5b7e842a4c4b69c77aa11f2ee09984eb46f9808 100644
--- a/src/TNL/Matrices/TridiagonalRow_impl.h
+++ b/src/TNL/Matrices/TridiagonalRow_impl.h
@@ -61,11 +61,11 @@ setElement( const Index& elementIndex,
             const Index& column,
             const Real& value )
 {
-   Assert( this->values, );
-   Assert( this->step > 0,);
-   Assert( column >= 0 && column < this->columns,
+   TNL_ASSERT( this->values, );
+   TNL_ASSERT( this->step > 0,);
+   TNL_ASSERT( column >= 0 && column < this->columns,
               std::cerr << "column = " << columns << " this->columns = " << this->columns );
-   Assert( abs( column - row ) <= 1,
+   TNL_ASSERT( abs( column - row ) <= 1,
               std::cerr << "column = " << column << " row =  " << row );
 
    /****
diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h
index afd72d1cd658b69faa57049aa6f656bbacd95852..7876f07cb7e816817f6ca59aadebd32afb9874be 100644
--- a/src/TNL/Matrices/Tridiagonal_impl.h
+++ b/src/TNL/Matrices/Tridiagonal_impl.h
@@ -240,7 +240,7 @@ bool Tridiagonal< Real, Device, Index >::setRowFast( const IndexType row,
                                                               const RealType* values,
                                                               const IndexType elements )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    return this->addRowFast( row, columns, values, elements, 0.0 );
@@ -254,7 +254,7 @@ bool Tridiagonal< Real, Device, Index >::setRow( const IndexType row,
                                                           const RealType* values,
                                                           const IndexType elements )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    return this->addRow( row, columns, values, elements, 0.0 );
@@ -270,7 +270,7 @@ bool Tridiagonal< Real, Device, Index >::addRowFast( const IndexType row,
                                                               const IndexType elements,
                                                               const RealType& thisRowMultiplicator )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    if( elements > 3 )
@@ -294,7 +294,7 @@ bool Tridiagonal< Real, Device, Index >::addRow( const IndexType row,
                                                           const IndexType elements,
                                                           const RealType& thisRowMultiplicator )
 {
-   Assert( elements <= this->columns,
+   TNL_ASSERT( elements <= this->columns,
             std::cerr << " elements = " << elements
                  << " this->columns = " << this->columns );
    if( elements > 3 )
@@ -381,7 +381,7 @@ const typename Tridiagonal< Real, Device, Index >::MatrixRow
 Tridiagonal< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
 {
-   Assert( false, );
+   TNL_ASSERT( false, );
 }
 
 
@@ -408,10 +408,10 @@ template< typename Real,
 void Tridiagonal< Real, Device, Index >::vectorProduct( const InVector& inVector,
                                                                  OutVector& outVector ) const
 {
-   Assert( this->getColumns() == inVector.getSize(),
+   TNL_ASSERT( this->getColumns() == inVector.getSize(),
             std::cerr << "Matrix columns: " << this->getColumns() << std::endl
                  << "Vector size: " << inVector.getSize() << std::endl );
-   Assert( this->getRows() == outVector.getSize(),
+   TNL_ASSERT( this->getRows() == outVector.getSize(),
                std::cerr << "Matrix rows: " << this->getRows() << std::endl
                     << "Vector size: " << outVector.getSize() << std::endl );
 
@@ -426,7 +426,7 @@ void Tridiagonal< Real, Device, Index >::addMatrix( const Tridiagonal< Real2, De
                                                              const RealType& matrixMultiplicator,
                                                              const RealType& thisMatrixMultiplicator )
 {
-   Assert( this->getRows() == matrix.getRows(),
+   TNL_ASSERT( this->getRows() == matrix.getRows(),
             std::cerr << "This matrix columns: " << this->getColumns() << std::endl
                  << "This matrix rows: " << this->getRows() << std::endl );
 
@@ -471,7 +471,7 @@ template< typename Real,
 void Tridiagonal< Real, Device, Index >::getTransposition( const Tridiagonal< Real2, Device, Index2 >& matrix,
                                                                     const RealType& matrixMultiplicator )
 {
-   Assert( this->getRows() == matrix.getRows(),
+   TNL_ASSERT( this->getRows() == matrix.getRows(),
                std::cerr << "This matrix rows: " << this->getRows() << std::endl
                     << "That matrix rows: " << matrix.getRows() << std::endl );
    if( std::is_same< Device, Devices::Host >::value )
@@ -595,10 +595,10 @@ __cuda_callable__
 Index Tridiagonal< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                     const IndexType column ) const
 {
-   Assert( row >= 0 && column >= 0 && row < this->rows && column < this->rows,
+   TNL_ASSERT( row >= 0 && column >= 0 && row < this->rows && column < this->rows,
               std::cerr << " this->rows = " << this->rows
                    << " row = " << row << " column = " << column );
-   Assert( abs( row - column ) < 2,
+   TNL_ASSERT( abs( row - column ) < 2,
               std::cerr << "row = " << row << " column = " << column << std::endl );
    return TridiagonalDeviceDependentCode< Device >::getElementIndex( this->rows, row, column );
 }
diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h
index 2a78a73df331e209d1242293ca626b26a4e8be5e..d2a96c4ad5bae4bf19374c90a60cfbd5d146b100 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D.h
@@ -95,7 +95,7 @@ class Grid< 1, Real, Device, Index > : public Object
    inline const RealType& getCellMeasure() const;
  
    __cuda_callable__
-   inline VertexType getSpaceSteps() const;
+   inline const VertexType& getSpaceSteps() const;
 
    template< int xPow >
    __cuda_callable__
diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
index 8103d5b20b6fe5d54ab183f09da1bd4427e155ab..cac0ad3da8884483d4f48e91b62baba5b30b0dda 100644
--- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h
@@ -90,7 +90,7 @@ template< typename Real,
           typename Index  >
 void Grid< 1, Real, Device, Index >::setDimensions( const Index xSize )
 {
-   Assert( xSize > 0, std::cerr << "xSize = " << xSize );
+   TNL_ASSERT( xSize > 0, std::cerr << "xSize = " << xSize );
    this->dimensions.x() = xSize;
    this->numberOfCells = xSize;
    this->numberOfVertices = xSize + 1;
@@ -225,7 +225,7 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__ inline
-typename Grid< 1, Real, Device, Index >::VertexType
+const typename Grid< 1, Real, Device, Index >::VertexType&
 Grid< 1, Real, Device, Index >::
 getSpaceSteps() const
 {
@@ -241,7 +241,7 @@ const Real&
 Grid< 1, Real, Device, Index >::
 getSpaceStepsProducts() const
 {
-   Assert( xPow >= -2 && xPow <= 2,
+   TNL_ASSERT( xPow >= -2 && xPow <= 2,
               std::cerr << " xPow = " << xPow );
    return this->spaceStepsProducts[ xPow + 2 ];
 }
diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h
index b4d69458f96a73ec9b77fad35b978254ac3a1ce5..6c7dfca7f61089f61c8ec2d9e5d4be995f3c9906 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D.h
@@ -99,7 +99,7 @@ class Grid< 2, Real, Device, Index > : public Object
    inline const RealType& getCellMeasure() const;
  
    __cuda_callable__
-   inline VertexType getSpaceSteps() const;
+   inline const VertexType& getSpaceSteps() const;
 
    template< int xPow, int yPow >
    __cuda_callable__
diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
index b888298ae2696305f9978d3f6d100b2178ed89f2..4d0d87fb2f1d7f752c6330c94c90137b7cb3830d 100644
--- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h
@@ -134,8 +134,8 @@ template< typename Real,
           typename Index >
 void Grid< 2, Real, Device, Index > :: setDimensions( const Index xSize, const Index ySize )
 {
-   Assert( xSize > 0, std::cerr << "xSize = " << xSize );
-   Assert( ySize > 0, std::cerr << "ySize = " << ySize );
+   TNL_ASSERT( xSize > 0, std::cerr << "xSize = " << xSize );
+   TNL_ASSERT( ySize > 0, std::cerr << "ySize = " << ySize );
 
    this->dimensions.x() = xSize;
    this->dimensions.y() = ySize;
@@ -278,7 +278,7 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__ inline
-typename Grid< 2, Real, Device, Index >::VertexType
+const typename Grid< 2, Real, Device, Index >::VertexType&
 Grid< 2, Real, Device, Index >::
 getSpaceSteps() const
 {
@@ -294,9 +294,9 @@ const Real&
 Grid< 2, Real, Device, Index >::
 getSpaceStepsProducts() const
 {
-   Assert( xPow >= -2 && xPow <= 2,
+   TNL_ASSERT( xPow >= -2 && xPow <= 2,
               std::cerr << " xPow = " << xPow );
-   Assert( yPow >= -2 && yPow <= 2,
+   TNL_ASSERT( yPow >= -2 && yPow <= 2,
               std::cerr << " yPow = " << yPow );
 
    return this->spaceStepsProducts[ xPow + 2 ][ yPow + 2 ];
diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h
index 84e326996a95481f140e583e1295fa06997d899b..860b5e473528cf0de601648a77bb0bd6f0aba490 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D.h
@@ -92,7 +92,7 @@ class Grid< 3, Real, Device, Index > : public Object
    inline const RealType& getCellMeasure() const;
 
    __cuda_callable__
-   inline VertexType getSpaceSteps() const;
+   inline const VertexType& getSpaceSteps() const;
  
    template< int xPow, int yPow, int zPow >
    __cuda_callable__
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
index bf4cd4b891b8932eb2abe7935f4da699049a3bf9..d9cdaa1dfe62359328f11c369cd16d0a43d168d5 100644
--- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h
@@ -164,9 +164,9 @@ template< typename Real,
           typename Index >
 void Grid< 3, Real, Device, Index > :: setDimensions( const Index xSize, const Index ySize, const Index zSize )
 {
-   Assert( xSize > 0, std::cerr << "xSize = " << xSize );
-   Assert( ySize > 0, std::cerr << "ySize = " << ySize );
-   Assert( zSize > 0, std::cerr << "zSize = " << zSize );
+   TNL_ASSERT( xSize > 0, std::cerr << "xSize = " << xSize );
+   TNL_ASSERT( ySize > 0, std::cerr << "ySize = " << ySize );
+   TNL_ASSERT( zSize > 0, std::cerr << "zSize = " << zSize );
 
    this->dimensions.x() = xSize;
    this->dimensions.y() = ySize;
@@ -325,7 +325,7 @@ template< typename Real,
           typename Device,
           typename Index >
 __cuda_callable__ inline
-typename Grid< 3, Real, Device, Index >::VertexType
+const typename Grid< 3, Real, Device, Index >::VertexType&
 Grid< 3, Real, Device, Index >::
 getSpaceSteps() const
 {
@@ -341,11 +341,11 @@ const Real&
 Grid< 3, Real, Device, Index >::
 getSpaceStepsProducts() const
 {
-   Assert( xPow >= -2 && xPow <= 2,
+   TNL_ASSERT( xPow >= -2 && xPow <= 2,
               std::cerr << " xPow = " << xPow );
-   Assert( yPow >= -2 && yPow <= 2,
+   TNL_ASSERT( yPow >= -2 && yPow <= 2,
               std::cerr << " yPow = " << yPow );
-   Assert( zPow >= -2 && zPow <= 2,
+   TNL_ASSERT( zPow >= -2 && zPow <= 2,
               std::cerr << " zPow = " << zPow );
 
    return this->spaceStepsProducts[ xPow + 2 ][ yPow + 2 ][ zPow + 2 ];
@@ -511,7 +511,7 @@ template< typename Real,
 bool Grid< 3, Real, Device, Index >::writeMesh( const String& fileName,
                                                    const String& format ) const
 {
-   Assert( false, std::cerr << "TODO: FIX THIS"); // TODO: FIX THIS
+   TNL_ASSERT( false, std::cerr << "TODO: FIX THIS"); // TODO: FIX THIS
    return true;
 }
 
diff --git a/src/TNL/Meshes/GridDetails/GridEntityGetter_impl.h b/src/TNL/Meshes/GridDetails/GridEntityGetter_impl.h
index e3f93e4599e6b1e30a46e25d1afdd2280b7ef00e..4c51e936a3078597c649aa293d8c451e4d542e3c 100644
--- a/src/TNL/Meshes/GridDetails/GridEntityGetter_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridEntityGetter_impl.h
@@ -44,7 +44,7 @@ class GridEntityGetter<
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
               std::cerr << " index = " << index
                    << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                    << " entityDimensions = " << entityDimensions );
@@ -59,7 +59,7 @@ class GridEntityGetter<
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() < grid.getDimensions() + CoordinatesType( 1 - entityDimensions ),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " grid.getDimensions() = " << grid.getDimensions()
@@ -90,7 +90,7 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -109,7 +109,7 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 2 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < grid.getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " grid.getDimensions() = " << grid.getDimensions() );
@@ -142,7 +142,7 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 1 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -171,10 +171,10 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 1 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < grid.getDimensions() + abs( entity.getOrientation() ),
                  std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
-                      << " dimensions.x() = " << grid.getDimensions()
+                      << " grid.getDimensions() = " << grid.getDimensions()
                       << " abs( entity.getOrientation() ) = " << abs( entity.getOrientation() ) );
  
          const CoordinatesType coordinates = entity.getCoordinates();
@@ -205,7 +205,7 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 0 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -225,7 +225,7 @@ class GridEntityGetter< Meshes::Grid< 2, Real, Device, Index >, GridEntity, 0 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= 0 && entity.getCoordinates() <= grid.getDimensions(),
+         TNL_ASSERT( entity.getCoordinates() >= 0 && entity.getCoordinates() <= grid.getDimensions(),
             std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                  << " grid.getDimensions() = " << grid.getDimensions() );
  
@@ -258,7 +258,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 3 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -278,7 +278,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 3 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < grid.getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " grid.getDimensions() = " << grid.getDimensions() );
@@ -310,7 +310,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 2 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -354,10 +354,10 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 2 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < grid.getDimensions() + abs( entity.getOrientation() ),
                  std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
-                      << " dimensions.x() = " << grid.getDimensions()
+                      << " grid.getDimensions() = " << grid.getDimensions()
                       << " abs( entity.getOrientation() ) = " << abs( entity.getOrientation() ) );
  
          const CoordinatesType coordinates = entity.getCoordinates();
@@ -400,7 +400,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 1 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -447,11 +447,11 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 1 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < grid.getDimensions() +
                        CoordinatesType( 1, 1, 1 ) - entity.getBasis(),
             std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
-                 << " dimensions.x() = " << grid.getDimensions()
+                 << " grid.getDimensions() = " << grid.getDimensions()
                  << " CoordinatesType( 1, 1, 1 ) - entity.getBasis() = " << CoordinatesType( 1, 1, 1 ) - entity.getBasis() );
  
          const CoordinatesType coordinates = entity.getCoordinates();
@@ -490,7 +490,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 0 >
       static GridEntity getEntity( const GridType& grid,
                                    const IndexType& index )
       {
-         Assert( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
+         TNL_ASSERT( index >= 0 && index < grid.template getEntitiesCount< GridEntity >(),
            std::cerr << " index = " << index
                 << " grid.getEntitiesCount<>() = " << grid.template getEntitiesCount< GridEntity >()
                 << " entityDimensions = " << entityDimensions );
@@ -512,7 +512,7 @@ class GridEntityGetter< Meshes::Grid< 3, Real, Device, Index >, GridEntity, 0 >
       static IndexType getEntityIndex( const GridType& grid,
                                        const GridEntity& entity )
       {
-         Assert( entity.getCoordinates() >= 0 && entity.getCoordinates() <= grid.getDimensions(),
+         TNL_ASSERT( entity.getCoordinates() >= 0 && entity.getCoordinates() <= grid.getDimensions(),
             std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                  << " grid.getDimensions() = " << grid.getDimensions() );
  
diff --git a/src/TNL/Meshes/GridDetails/GridEntity_impl.h b/src/TNL/Meshes/GridDetails/GridEntity_impl.h
index 6510e2ab2102d59c3ca9816e1bb8ad222a081e23..6cb58269e73c1d12d75d8e5b8f3b610b12c9156d 100644
--- a/src/TNL/Meshes/GridDetails/GridEntity_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridEntity_impl.h
@@ -139,11 +139,11 @@ getIndex() const
 {
    typedef Meshes::Grid< Dimensions, Real, Device, Index > GridType;
    typedef typename GridType::template MeshEntity< EntityDimensions > EntityType;
-   Assert( this->entityIndex >= 0 &&
+   TNL_ASSERT( this->entityIndex >= 0 &&
               this-> entityIndex < grid.template getEntitiesCount< EntityType >(),
               std::cerr << "this->entityIndex = " << this->entityIndex
                    << " grid.template getEntitiesCount< EntityDimensions >() = " << grid.template getEntitiesCount< EntityType >() );
-   Assert( this->entityIndex == grid.getEntityIndex( *this ),
+   TNL_ASSERT( this->entityIndex == grid.getEntityIndex( *this ),
               std::cerr << "this->entityIndex = " << this->entityIndex
                    << " grid.getEntityIndex( *this ) = " << grid.getEntityIndex( *this ) );
    return this->entityIndex;
@@ -387,11 +387,11 @@ Index
 GridEntity< Meshes::Grid< Dimensions, Real, Device, Index >, Dimensions, Config >::
 getIndex() const
 {
-   Assert( this->entityIndex >= 0 &&
+   TNL_ASSERT( this->entityIndex >= 0 &&
               this-> entityIndex < grid.template getEntitiesCount< ThisType >(),
               std::cerr << "this->entityIndex = " << this->entityIndex
                    << " grid.template getEntitiesCount< Dimensions >() = " << grid.template getEntitiesCount< ThisType >() );
-   Assert( this->entityIndex == grid.getEntityIndex( *this ),
+   TNL_ASSERT( this->entityIndex == grid.getEntityIndex( *this ),
               std::cerr << "this->index = " << this->entityIndex
                    << " grid.getEntityIndex( *this ) = " << grid.getEntityIndex( *this ) );
    return this->entityIndex;
@@ -605,11 +605,11 @@ getIndex() const
 {
    typedef Meshes::Grid< Dimensions, Real, Device, Index > GridType;
    typedef typename GridType::Vertex Vertex;
-   Assert( this->entityIndex >= 0 &&
+   TNL_ASSERT( this->entityIndex >= 0 &&
               this-> entityIndex < grid.template getEntitiesCount< Vertex >(),
               std::cerr << "this->entityIndex = " << this->entityIndex
                    << " grid.template getEntitiesCount< 0 >() = " << grid.template getEntitiesCount< Vertex >() );
-   Assert( this->entityIndex == grid.getEntityIndex( *this ),
+   TNL_ASSERT( this->entityIndex == grid.getEntityIndex( *this ),
               std::cerr << "this->entityIndex = " << this->entityIndex
                    << " grid.getEntityIndex( *this ) = " << grid.getEntityIndex( *this ) );
    return this->entityIndex;
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
index 458069ce5d1c9af6f00f519d5e3c0044fa544fee..3cedde53c4285aa8cf1434309fc41998891d9a53 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h
@@ -312,17 +312,14 @@ GridTraverser2D(
    coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;  
 
-   if( coordinates <= end )
-   {
+   if( ( !processOnlyBoundaryEntities && coordinates <= end ) ||
+       (  processOnlyBoundaryEntities &&
+          ( coordinates.x() == begin.x() || coordinates.y() == begin.y() ||
+            coordinates.x() == end.x() || coordinates.y() == end.y() ) ) )
+   { 
       GridEntity entity( *grid, coordinates, gridEntityParameters... );
       entity.refresh();
-      if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
-      {
-         EntitiesProcessor::processEntity
-         ( entity.getMesh(),
-           *userData,
-           entity );
-      }
+      EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity );      
    }
 }
 #endif
@@ -522,7 +519,19 @@ GridTraverser3D(
    coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
    coordinates.z() = begin.z() + ( gridZIdx * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
 
-   if( coordinates <= end )
+   
+   if( ( !processOnlyBoundaryEntities && coordinates <= end ) ||
+    (  processOnlyBoundaryEntities &&
+       ( coordinates.x() == begin.x() || coordinates.y() == begin.y() || coordinates.z() == begin.z() ||
+         coordinates.x() == end.x() || coordinates.y() == end.y() || coordinates.z() == end.z() ) ) )
+   { 
+      GridEntity entity( *grid, coordinates, gridEntityParameters... );
+      entity.refresh();
+      EntitiesProcessor::processEntity( entity.getMesh(), *userData, entity );      
+   }
+
+   
+   /*if( coordinates <= end )
    {
       GridEntity entity( *grid, coordinates, gridEntityParameters... );
       entity.refresh();
@@ -533,7 +542,7 @@ GridTraverser3D(
            *userData,
            entity );
       }
-   }
+   }*/
 }
 #endif
 
diff --git a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter.h b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter.h
index 3bf6d56900178efec502afab32bd7c8d815279fd..e956b31c45732650101f0cc3b11157cf77ecce00 100644
--- a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter.h
+++ b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter.h
@@ -29,14 +29,14 @@ class NeighbourGridEntityGetter
       __cuda_callable__
       NeighbourGridEntityGetter( const GridEntity& entity )
       {
-         //Assert( false, );
+         //TNL_ASSERT( false, );
       }
  
       __cuda_callable__
       void refresh( const typename GridEntity::GridType& grid,
                     const typename GridEntity::IndexType& entityIndex )
       {
-         //Assert( false, );
+         //TNL_ASSERT( false, );
       }
 
 };
diff --git a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter1D_impl.h b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter1D_impl.h
index ec0509da104c6df20d35d70c0505497042ede582..246c508d49ba4eb9ce010f5182d20d077e09f873 100644
--- a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter1D_impl.h
@@ -56,12 +56,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( this->entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( this->entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     this->entity.getCoordinates() < this->entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << this->entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() + CoordinatesType( step ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -73,12 +73,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() + CoordinatesType( step ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -136,12 +136,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( this->entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( this->entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     this->entity.getCoordinates() < this->entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << this->entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() + CoordinatesType( step ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -153,12 +153,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( step ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() + CoordinatesType( step ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -237,12 +237,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step + ( step < 0 ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step + ( step < 0 ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step + ( step < 0 ) <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -254,12 +254,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step + ( step < 0 ) >= CoordinatesType( 0 ).x() &&
+         TNL_ASSERT( entity.getCoordinates().x() + step + ( step < 0 ) >= CoordinatesType( 0 ).x() &&
                     entity.getCoordinates().x() + step + ( step < 0 ) <= entity.getMesh().getDimensions().x(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -318,12 +318,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step - ( step > 0 ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -335,12 +335,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step - ( step > 0 ) >= 0 &&
+         TNL_ASSERT( entity.getCoordinates().x() + step - ( step > 0 ) >= 0 &&
                     entity.getCoordinates().x() + step - ( step > 0 ) < entity.getMesh().getDimensions().x(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -394,12 +394,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step - ( step > 0 ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -411,12 +411,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step - ( step > 0 ) >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step - ( step > 0 ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -470,12 +470,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -487,12 +487,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates().x() + step >= CoordinatesType( 0 ) &&
+         TNL_ASSERT( entity.getCoordinates().x() + step >= CoordinatesType( 0 ) &&
                     entity.getCoordinates().x() + step <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
diff --git a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter2D_impl.h b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter2D_impl.h
index 45f3e225a99a56f5c522794deab57aa81ab30cb3..b87c08d325ccb08fc4bb17f267f8bd4e34d0d436 100644
--- a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter2D_impl.h
@@ -56,12 +56,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -75,12 +75,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -140,12 +140,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -159,12 +159,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -269,14 +269,14 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( ! stepX + ! stepY == 1,
+         TNL_ASSERT( ! stepX + ! stepY == 1,
                     std::cerr << "Only one of the steps can be non-zero: stepX = " << stepX << " stepY = " << stepY );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ),
                                         stepY + ( stepY < 0 ) ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() +
@@ -348,14 +348,14 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( stepX != 0 && stepY != 0,
+         TNL_ASSERT( stepX != 0 && stepY != 0,
                     std::cerr << " stepX = " << stepX << " stepY = " << stepY );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ), stepY + ( stepY < 0 ) ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ), stepY + ( stepY < 0 ) )
@@ -425,16 +425,16 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         /*Assert( ( ( !! stepX ) == ( !! entity.getOrientation().x() ) ) &&
+         /*TNL_ASSERT( ( ( !! stepX ) == ( !! entity.getOrientation().x() ) ) &&
                     ( ( !! stepY ) == ( !! entity.getOrientation().y() ) ),
                     std::cerr << "( stepX, stepY ) cannot be perpendicular to entity coordinates: stepX = " << stepX << " stepY = " << stepY
                          << " entity.getOrientation() = " << entity.getOrientation() );*/
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions() + entity.getOrientation(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() + entity.getOrientation() = " << entity.getMesh().getDimensions() + entity.getOrientation()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX - ( stepX > 0 ) * ( entity.getOrientation().x() != 0.0 ),
                                         stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ) ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() +
@@ -502,12 +502,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -521,12 +521,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
diff --git a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter3D_impl.h b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter3D_impl.h
index d4d25ad0b9b824f125e338fd1c017377edbab5b9..e14eb0fbc1c62c2f49296de72d9a601c4f5d609e 100644
--- a/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighbourGridEntityGetter3D_impl.h
@@ -56,12 +56,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -75,12 +75,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ ) = "
                    << entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ )
@@ -142,12 +142,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY ) = " << entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ )
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
@@ -161,12 +161,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ ) = "
                    << entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ )
@@ -290,16 +290,16 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( ! stepX + ! stepY + ! stepZ == 2,
+         TNL_ASSERT( ! stepX + ! stepY + ! stepZ == 2,
                     std::cerr << "Only one of the steps can be non-zero: stepX = " << stepX
                          << " stepY = " << stepY
                          << " stepZ = " << stepZ );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ),
                                         stepY + ( stepY < 0 ),
                                         stepZ + ( stepZ < 0 ) ) >= CoordinatesType( 0, 0, 0 ) &&
@@ -379,16 +379,16 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( ! stepX + ! stepY + ! stepZ == 2,
+         TNL_ASSERT( ! stepX + ! stepY + ! stepZ == 2,
                     std::cerr << "Only one of the steps can be non-zero: stepX = " << stepX
                          << " stepY = " << stepY
                          << " stepZ = " << stepZ );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ),
                                         stepY + ( stepY < 0 ),
                                         stepZ + ( stepZ < 0 ) ) >= CoordinatesType( 0, 0, 0 ) &&
@@ -469,16 +469,16 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( ! stepX + ! stepY + ! stepZ == 1,
+         TNL_ASSERT( ! stepX + ! stepY + ! stepZ == 1,
                     std::cerr << "Exactly two of the steps must be non-zero: stepX = " << stepX
                          << " stepY = " << stepY
                          << " stepZ = " << stepZ );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ),
                                         stepY + ( stepY < 0 ),
                                         stepZ + ( stepZ < 0 ) ) >= CoordinatesType( 0, 0, 0 ) &&
@@ -554,16 +554,16 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( stepX != 0 && stepY != 0 && stepZ != 0,
+         TNL_ASSERT( stepX != 0 && stepY != 0 && stepZ != 0,
                     std::cerr << " stepX = " << stepX
                          << " stepY = " << stepY
                          << " stepZ = " << stepZ );
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX + ( stepX < 0 ),
                                         stepY + ( stepY < 0 ),
                                         stepZ + ( stepZ < 0 ) ) >= CoordinatesType( 0, 0, 0 ) &&
@@ -637,18 +637,18 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         /*Assert( ( ( !! stepX ) == ( !! entity.getOrientation().x() ) ) &&
+         /*TNL_ASSERT( ( ( !! stepX ) == ( !! entity.getOrientation().x() ) ) &&
                     ( ( !! stepY ) == ( !! entity.getOrientation().y() ) ) &&
                     ( ( !! stepZ ) == ( !! entity.getOrientation().z() ) ),
                     std::cerr << "( stepX, stepY, stepZ ) cannot be perpendicular to entity coordinates: stepX = " << stepX
                          << " stepY = " << stepY << " stepZ = " << stepZ
                          << " entity.getOrientation() = " << entity.getOrientation() );*/
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() < entity.getMesh().getDimensions() + entity.getOrientation(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() + entity.getOrientation() = " << entity.getMesh().getDimensions() + entity.getOrientation()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() +
+         TNL_ASSERT( entity.getCoordinates() +
                        CoordinatesType( stepX - ( stepX > 0 ) * ( entity.getOrientation().x() != 0.0 ),
                                         stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ),
                                         stepZ - ( stepZ > 0 ) * ( entity.getOrientation().z() != 0.0 ) ) >= CoordinatesType( 0, 0, 0 ) &&
@@ -723,12 +723,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       NeighbourGridEntityType getEntity() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ ) = "
                    << entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ )
@@ -743,12 +743,12 @@ class NeighbourGridEntityGetter<
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
-         Assert( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates() = " << entity.getCoordinates()
                    << " entity.getMesh().getDimensions() = " << entity.getMesh().getDimensions()
                    << " EntityDimensions = " << EntityDimensions );
-         Assert( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
+         TNL_ASSERT( entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) >= CoordinatesType( 0, 0, 0 ) &&
                     entity.getCoordinates() + CoordinatesType( stepX, stepY, stepZ ) <= entity.getMesh().getDimensions(),
               std::cerr << "entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ ) = "
                    << entity.getCoordinates()  + CoordinatesType( stepX, stepY, stepZ )
diff --git a/src/TNL/Meshes/MeshBuilder.h b/src/TNL/Meshes/MeshBuilder.h
index d9782c4c28ae53610c4760839ee6e7273964cba9..2eac2aa8b717868d19bbee442a6755d49727ee47 100644
--- a/src/TNL/Meshes/MeshBuilder.h
+++ b/src/TNL/Meshes/MeshBuilder.h
@@ -31,7 +31,7 @@ class MeshBuilder
 
    bool setPointsCount( const GlobalIndexType& points )
    {
-      Assert( 0 <= points, std::cerr << "pointsCount = " << points );
+      TNL_ASSERT( 0 <= points, std::cerr << "pointsCount = " << points );
       this->points.setSize( points );
       this->pointsSet.setSize( points );
       pointsSet.setValue( false );
@@ -40,7 +40,7 @@ class MeshBuilder
  
    bool setCellsCount( const GlobalIndexType& cellsCount )
    {
-      Assert( 0 <= cellsCount, std::cerr << "cellsCount = " << cellsCount );
+      TNL_ASSERT( 0 <= cellsCount, std::cerr << "cellsCount = " << cellsCount );
       this->cellSeeds.setSize( cellsCount );
       return true;
    }
@@ -52,7 +52,7 @@ class MeshBuilder
    void setPoint( GlobalIndexType index,
                  const PointType& point )
    {
-	Assert( 0 <= index && index < getPointsCount(), std::cerr << "Index = " << index );
+	TNL_ASSERT( 0 <= index && index < getPointsCount(), std::cerr << "Index = " << index );
 
         this->points[ index ] = point;
         this->pointsSet[ index ] = true;
@@ -60,7 +60,7 @@ class MeshBuilder
 
    CellSeedType& getCellSeed( GlobalIndexType index )
    {
-      Assert( 0 <= index && index < getCellsCount(), std::cerr << "Index = " << index );
+      TNL_ASSERT( 0 <= index && index < getCellsCount(), std::cerr << "Index = " << index );
  
       return this->cellSeeds[ index ];
    }
diff --git a/src/TNL/Meshes/MeshDetails/MeshEntityId.h b/src/TNL/Meshes/MeshDetails/MeshEntityId.h
index f595042cf92cb95b33756d0bc4d959c75a315433..d3aa829b7d669830d947992e076e5c2a0e252590 100644
--- a/src/TNL/Meshes/MeshDetails/MeshEntityId.h
+++ b/src/TNL/Meshes/MeshDetails/MeshEntityId.h
@@ -25,7 +25,7 @@ class MeshEntityId
 
    const IDType &getId() const
    {
-      Assert( this->id >= 0, );
+      TNL_ASSERT( this->id >= 0, );
       return this->id;
    }
 
diff --git a/src/TNL/Meshes/MeshDetails/MeshEntityReferenceOrientation.h b/src/TNL/Meshes/MeshDetails/MeshEntityReferenceOrientation.h
index 73fb5e80dcb67444423287347f75c0b2d6a44180..335923a49cdf5d79d7fe9627ae9e7653446552e0 100644
--- a/src/TNL/Meshes/MeshDetails/MeshEntityReferenceOrientation.h
+++ b/src/TNL/Meshes/MeshDetails/MeshEntityReferenceOrientation.h
@@ -30,7 +30,7 @@ class MeshEntityReferenceOrientation
          auto referenceCornerIds = referenceSeed.getCornerIds();
          for( LocalIndexType i = 0; i < referenceCornerIds.getSize(); i++ )
          {
-            Assert( this->cornerIdsMap.find( referenceCornerIds[i]) == this->cornerIdsMap.end(), );
+            TNL_ASSERT( this->cornerIdsMap.find( referenceCornerIds[i]) == this->cornerIdsMap.end(), );
             this->cornerIdsMap.insert( std::make_pair( referenceCornerIds[i], i ) );
          }
       }
@@ -43,7 +43,7 @@ class MeshEntityReferenceOrientation
          auto cornerIds = seed.getCornerIds();
          for( LocalIndexType i = 0; i < cornerIds.getSize(); i++ )
          {
-            Assert( this->cornerIdsMap.find( cornerIds[ i ] ) != this->cornerIdsMap.end(), );
+            TNL_ASSERT( this->cornerIdsMap.find( cornerIds[ i ] ) != this->cornerIdsMap.end(), );
             result.setPermutationValue( i, this->cornerIdsMap.find( cornerIds[ i ])->second );
          }
          return result;
diff --git a/src/TNL/Meshes/MeshDetails/MeshEntity_impl.h b/src/TNL/Meshes/MeshDetails/MeshEntity_impl.h
index 65d96194d6811e783e5b154d6fffd9e517b79242..fe533414aa505d28681402e7bdf450e156222a89 100644
--- a/src/TNL/Meshes/MeshDetails/MeshEntity_impl.h
+++ b/src/TNL/Meshes/MeshDetails/MeshEntity_impl.h
@@ -146,7 +146,7 @@ MeshEntity< MeshConfig, EntityTopology >::
 getSubentityIndex( const LocalIndexType localIndex) const
 {
    static_assert( SubentityTraits< Subdimensions >::storageEnabled, "You try to get subentity which is not configured for storage." );
-   Assert( 0 <= localIndex &&
+   TNL_ASSERT( 0 <= localIndex &&
               localIndex < SubentityTraits< Subdimensions >::count,
               std::cerr << "localIndex = " << localIndex
                    << " subentitiesCount = "
@@ -200,7 +200,7 @@ MeshEntity< MeshConfig, EntityTopology >::
 getSuperentityIndex( const LocalIndexType localIndex ) const
 {
    static_assert( SuperentityTraits< SuperDimensions >::storageEnabled, "You try to get superentity which is not configured for storage." );
-   Assert( localIndex < this->getNumberOfSuperentities< SuperDimensions >(),
+   TNL_ASSERT( localIndex < this->getNumberOfSuperentities< SuperDimensions >(),
               std::cerr << " localIndex = " << localIndex
                    << " this->getNumberOfSuperentities< Dimensions >() = " << this->getNumberOfSuperentities< SuperDimensions >() << std::endl; );
    typedef MeshSuperentityAccess< MeshConfig, EntityTopology >  SuperentityBaseType;
@@ -276,7 +276,7 @@ MeshEntity< MeshConfig, EntityTopology >::
 subentityOrientation( LocalIndexType index ) const
 {
    static const LocalIndexType subentitiesCount = SubentityTraits< Dimensions >::count;
-   Assert( 0 <= index && index < subentitiesCount, );
+   TNL_ASSERT( 0 <= index && index < subentitiesCount, );
 
    return SubentityStorageLayers::subentityOrientation( MeshDimensionsTag< Dimensions >(), index );
 }
@@ -294,7 +294,7 @@ setSubentityIndex( const LocalIndexType localIndex,
                    const GlobalIndexType globalIndex )
 {
    static_assert( SubentityTraits< Subdimensions >::storageEnabled, "You try to set subentity which is not configured for storage." );
-   Assert( 0 <= localIndex &&
+   TNL_ASSERT( 0 <= localIndex &&
               localIndex < SubentityTraits< Subdimensions >::count,
               std::cerr << "localIndex = " << localIndex
                    << " subentitiesCount = "
@@ -448,7 +448,7 @@ typename MeshEntity< MeshConfig, MeshVertexTopology >::GlobalIndexType
 MeshEntity< MeshConfig, MeshVertexTopology >::
 getSuperentityIndex( const LocalIndexType localIndex ) const
 {
-   Assert( localIndex < this->getNumberOfSuperentities< Dimensions >(),
+   TNL_ASSERT( localIndex < this->getNumberOfSuperentities< Dimensions >(),
               std::cerr << " localIndex = " << localIndex
                    << " this->getNumberOfSuperentities< Dimensions >() = " << this->getNumberOfSuperentities< Dimensions >() << std::endl; );
    typedef MeshSuperentityAccess< MeshConfig, MeshVertexTopology >  SuperentityBaseType;
diff --git a/src/TNL/Meshes/MeshDetails/initializer/MeshEntitySeed.h b/src/TNL/Meshes/MeshDetails/initializer/MeshEntitySeed.h
index bdb685de379a249e43838146c5c34b7486f2803e..4fecf51bd4c4f5fd77ebcf3f1c3dca58202febda 100644
--- a/src/TNL/Meshes/MeshDetails/initializer/MeshEntitySeed.h
+++ b/src/TNL/Meshes/MeshDetails/initializer/MeshEntitySeed.h
@@ -37,8 +37,8 @@ class MeshEntitySeed
 
       void setCornerId( LocalIndexType cornerIndex, GlobalIndexType pointIndex )
       {
-         Assert( 0 <= cornerIndex && cornerIndex < getCornersCount(), std::cerr << "cornerIndex = " << cornerIndex );
-         Assert( 0 <= pointIndex, std::cerr << "pointIndex = " << pointIndex );
+         TNL_ASSERT( 0 <= cornerIndex && cornerIndex < getCornersCount(), std::cerr << "cornerIndex = " << cornerIndex );
+         TNL_ASSERT( 0 <= pointIndex, std::cerr << "pointIndex = " << pointIndex );
 
          this->cornerIds[ cornerIndex ] = pointIndex;
       }
diff --git a/src/TNL/Meshes/MeshDetails/initializer/MeshInitializer.h b/src/TNL/Meshes/MeshDetails/initializer/MeshInitializer.h
index 22e88efda6d845112e217c93eb53045790330d38..703843c27e7c00bba3912c807e5d18dc71e011fd 100644
--- a/src/TNL/Meshes/MeshDetails/initializer/MeshInitializer.h
+++ b/src/TNL/Meshes/MeshDetails/initializer/MeshInitializer.h
@@ -570,13 +570,13 @@ class MeshInitializerLayer< MeshConfig,
 
       MeshType& getMesh()
       {
-         Assert( this->mesh, );
+         TNL_ASSERT( this->mesh, );
          return *( this->mesh );
       }
 
       VertexInitializerType& getEntityInitializer( DimensionsTag, GlobalIndexType index )
       {
-         Assert( index >= 0 && index < vertexInitializerContainer.getSize(),
+         TNL_ASSERT( index >= 0 && index < vertexInitializerContainer.getSize(),
                   std::cerr << " index = " << index
                        << " vertexInitializerContainer.getSize() = " << vertexInitializerContainer.getSize() << std::endl; );
          return vertexInitializerContainer[ index ];
diff --git a/src/TNL/Meshes/MeshDetails/layers/MeshSubentityStorageLayer.h b/src/TNL/Meshes/MeshDetails/layers/MeshSubentityStorageLayer.h
index 955aa90d3da214b7624d714f93e410b5d54d97ac..6acf7017df34aab6c4258a25085aaa3973308e86 100644
--- a/src/TNL/Meshes/MeshDetails/layers/MeshSubentityStorageLayer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/MeshSubentityStorageLayer.h
@@ -145,7 +145,7 @@ class MeshSubentityStorageLayer< MeshConfig,
    using BaseType::subentityOrientation;
    IdPermutationArrayAccessorType subentityOrientation( DimensionsTag, LocalIndexType index) const
    {
-      Assert( 0 <= index && index < SubentityTraitsType::count, );
+      TNL_ASSERT( 0 <= index && index < SubentityTraitsType::count, );
  
       return this->subentityOrientations[ index ].getSubvertexPermutation();
    }
diff --git a/src/TNL/Meshes/MeshDetails/layers/MeshSuperentityStorageLayer.h b/src/TNL/Meshes/MeshDetails/layers/MeshSuperentityStorageLayer.h
index f5fa7c10e45a808cd0d0c51ebf6525ac287ac2db..82ba0cf1c294fdbe78f956e03ca2d31b82e70304 100644
--- a/src/TNL/Meshes/MeshDetails/layers/MeshSuperentityStorageLayer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/MeshSuperentityStorageLayer.h
@@ -250,13 +250,13 @@ class MeshSuperentityStorageLayer< MeshConfig, EntityTopology, MeshDimensionsTag
  
    typename MeshTraits< MeshConfig >::GlobalIdArrayType& superentityIdsArray( DimensionsTag )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
       //return this->superentitiesIndices;
    }
 
    StorageNetworkType& getStorageNetwork( DimensionsTag )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
      //return this->storageNetwork;
    }
 
@@ -323,13 +323,13 @@ class MeshSuperentityStorageLayer< MeshConfig,
  
    typename MeshTraits< MeshConfig >::GlobalIdArrayType& superentityIdsArray( DimensionsTag )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
       //return this->superentitiesIndices;
    }
 
    StorageNetworkType& getStorageNetwork( DimensionsTag )
    {
-      Assert( false, );
+      TNL_ASSERT( false, );
       //return this->storageNetwork;
    }
 
diff --git a/src/TNL/Meshes/MeshDetails/traits/MeshSuperentityTraits.h b/src/TNL/Meshes/MeshDetails/traits/MeshSuperentityTraits.h
index 37c0ff6caaf4ee377b6dda679cbe93d40c844043..1f2739e3fd4da1b51b498e6b56e4c9bd8ae24790 100644
--- a/src/TNL/Meshes/MeshDetails/traits/MeshSuperentityTraits.h
+++ b/src/TNL/Meshes/MeshDetails/traits/MeshSuperentityTraits.h
@@ -12,7 +12,7 @@
 
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/ConstSharedArray.h>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/Meshes/MeshEntity.h>
 #include <TNL/Meshes/MeshConfigBase.h>
 #include <TNL/Meshes/Topologies/MeshEntityTopology.h>
@@ -60,7 +60,7 @@ class MeshSuperentityTraits
    /****
     * This is used by the mesh initializer.
     */
-   typedef List< GlobalIndexType >                                       GrowableContainerType;
+   typedef Containers::List< GlobalIndexType >                                       GrowableContainerType;
 
 };
 
diff --git a/src/TNL/Object.cpp b/src/TNL/Object.cpp
index 06a819137a4f1f9bcbe3603263a9fd2762a317e0..14418d26da1412d8f3785fce4f0c76fb0391e948 100644
--- a/src/TNL/Object.cpp
+++ b/src/TNL/Object.cpp
@@ -10,8 +10,6 @@
 
 #include <TNL/Object.h>
 #include <TNL/Assert.h>
-#include <TNL/File.h>
-#include <TNL/List.h>
 #include <iostream>
 #include <fstream>
 #include <cstring>
@@ -171,7 +169,7 @@ bool getObjectType( const String& fileName, String& type )
 }
 
 bool parseObjectType( const String& objectType,
-                      List< String >& parsedObjectType )
+                      Containers::List< String >& parsedObjectType )
 {
    parsedObjectType.reset();
    int objectTypeLength = objectType. getLength();
@@ -191,7 +189,7 @@ bool parseObjectType( const String& objectType,
 
    /****
     * Now, we will extract the parameters.
-    * Each parameter can be template, so we must compute and pair
+    * Each parameter can be template, so we must count and pair
     * '<' with '>'.
     */
    int templateBrackets( 0 );
@@ -203,13 +201,12 @@ bool parseObjectType( const String& objectType,
          templateBrackets ++;
       if( ! templateBrackets )
       {
-         if( objectType[ i ] == ' ' ||
-             objectType[ i ] == ',' ||
+         if( objectType[ i ] == ',' ||
              objectType[ i ] == '>' )
          {
             if( buffer != "" )
             {
-               if( ! parsedObjectType. Append( buffer ) )
+               if( ! parsedObjectType. Append( buffer.strip( ' ' ) ) )
                   return false;
                buffer. setString( "" );
             }
diff --git a/src/TNL/Object.h b/src/TNL/Object.h
index d05c7595a4a9ee0eaa11be299440a1d247122e22..ed0ac90de971773bf52b01bb8715b8f43e425908 100644
--- a/src/TNL/Object.h
+++ b/src/TNL/Object.h
@@ -12,13 +12,11 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/String.h>
-
+#include <TNL/File.h>
+#include <TNL/Containers/List.h>
 
 namespace TNL {
 
-class File;
-template< class T > class List;
-
 //! This is basic class for all 'large' objects like matrices, meshes, grids, solvers etc.
 /*!
  *  Objects like numerical grids, meshes, matrices large vectors etc.
@@ -92,6 +90,6 @@ bool getObjectType( File& file, String& type );
 bool getObjectType( const String& file_name, String& type );
 
 bool parseObjectType( const String& objectType,
-                      List< String >& parsedObjectType );
+                      Containers::List< String >& parsedObjectType );
 
 } // namespace TNL
diff --git a/src/TNL/Operators/euler/fvm/LaxFridrichs.h b/src/TNL/Operators/euler/fvm/LaxFridrichs.h
index 9aa009ea1396322065a3032b00871c84f3b68347..60e0a86246ca6b6eaebccb5d7f92217d9ff8f267 100644
--- a/src/TNL/Operators/euler/fvm/LaxFridrichs.h
+++ b/src/TNL/Operators/euler/fvm/LaxFridrichs.h
@@ -44,13 +44,13 @@ class LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, GridGeometry >, Pressu
 
    static String getTypeStatic();
 
-   void getExplicitRhs( const IndexType centralVolume,
+   void getExplicitUpdate( const IndexType centralVolume,
                         RealType& rho_t,
                         RealType& rho_u1_t,
                         RealType& rho_u2_t,
                         const RealType& tau ) const;
 
-   void getExplicitRhs( const IndexType centralVolume,
+   void getExplicitUpdate( const IndexType centralVolume,
                         RealType& rho_t,
                         RealType& rho_u1_t,
                         RealType& rho_u2_t,
@@ -111,13 +111,13 @@ class LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, tnlIdenticalGridGeomet
 
    LaxFridrichs();
 
-   void getExplicitRhs( const IndexType centralVolume,
+   void getExplicitUpdate( const IndexType centralVolume,
                         RealType& rho_t,
                         RealType& rho_u1_t,
                         RealType& rho_u2_t,
                         const RealType& tau ) const;
 
-   void getExplicitRhs( const IndexType centralVolume,
+   void getExplicitUpdate( const IndexType centralVolume,
                         RealType& rho_t,
                         RealType& rho_u1_t,
                         RealType& rho_u2_t,
diff --git a/src/TNL/Operators/euler/fvm/LaxFridrichs_impl.h b/src/TNL/Operators/euler/fvm/LaxFridrichs_impl.h
index 684d5c4e1c1febbb0692ecfa8e1f6839bef9009a..88a780ad3bdf4a5da12e0d167b90dfb56938335e 100644
--- a/src/TNL/Operators/euler/fvm/LaxFridrichs_impl.h
+++ b/src/TNL/Operators/euler/fvm/LaxFridrichs_impl.h
@@ -141,14 +141,14 @@ template< typename Real,
           typename Index,
           typename PressureGradient,
           template< int, typename, typename, typename > class GridGeometry >
-void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, GridGeometry >, PressureGradient  > :: getExplicitRhs( const IndexType centralVolume,
+void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, GridGeometry >, PressureGradient  > :: getExplicitUpdate( const IndexType centralVolume,
                                                                                                               RealType& rho_t,
                                                                                                               RealType& rho_u1_t,
                                                                                                               RealType& rho_u2_t,
                                                                                                               const RealType& tau ) const
 {
-   Assert( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
-   Assert( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
+   TNL_ASSERT( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
+   TNL_ASSERT( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
 
    const IndexType& c = centralVolume;
    const IndexType e = this->mesh -> getElementNeighbour( centralVolume,  1,  0 );
@@ -392,14 +392,14 @@ template< typename Real,
           typename Device,
           typename Index,
           typename PressureGradient >
-void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, tnlIdenticalGridGeometry >, PressureGradient  > :: getExplicitRhs( const IndexType centralVolume,
+void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, tnlIdenticalGridGeometry >, PressureGradient  > :: getExplicitUpdate( const IndexType centralVolume,
                                                                                                                           RealType& rho_t,
                                                                                                                           RealType& rho_u1_t,
                                                                                                                           RealType& rho_u2_t,
                                                                                                                           const RealType& tau ) const
 {
-   Assert( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
-   Assert( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
+   TNL_ASSERT( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
+   TNL_ASSERT( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
 
    const IndexType& xSize = this->mesh -> getDimensions(). x();
    const IndexType& ySize = this->mesh -> getDimensions(). y();
@@ -449,15 +449,15 @@ template< typename Real,
           typename Device,
           typename Index,
           typename PressureGradient >
-void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, tnlIdenticalGridGeometry >, PressureGradient  > :: getExplicitRhs( const IndexType centralVolume,
+void LaxFridrichs< Meshes::Grid< 2, Real, Device, Index, tnlIdenticalGridGeometry >, PressureGradient  > :: getExplicitUpdate( const IndexType centralVolume,
                                                                                                                           RealType& rho_t,
                                                                                                                           RealType& rho_u1_t,
                                                                                                                           RealType& rho_u2_t,
                                                                                                                           RealType& e_t,
                                                                                                                           const RealType& tau ) const
 {
-   Assert( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
-   Assert( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
+   TNL_ASSERT( mesh, std::cerr << "No mesh has been binded with the Lax-Fridrichs scheme." );
+   TNL_ASSERT( pressureGradient, std::cerr << "No pressure gradient was set in the the Lax-Fridrichs scheme." )
 
    const IndexType& xSize = this->mesh -> getDimensions(). x();
    const IndexType& ySize = this->mesh -> getDimensions(). y();
diff --git a/src/TNL/Problems/HeatEquationProblem.h b/src/TNL/Problems/HeatEquationProblem.h
index 94bc122fdbb74b4b1cdc7ae8d1d1abac33f293ca..25e5a193383a1132a2cc7e38403b258213de6d7a 100644
--- a/src/TNL/Problems/HeatEquationProblem.h
+++ b/src/TNL/Problems/HeatEquationProblem.h
@@ -44,7 +44,7 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       typedef Functions::MeshFunction< Mesh > MeshFunctionType;
       typedef SharedPointer< MeshFunctionType, DeviceType > MeshFunctionPointer;
       typedef PDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;
-      typedef Matrices::CSR< RealType, DeviceType, IndexType > MatrixType;
+      typedef Matrices::SlicedEllpack< RealType, DeviceType, IndexType > MatrixType;
       typedef SharedPointer< DifferentialOperator > DifferentialOperatorPointer;
       typedef SharedPointer< BoundaryCondition > BoundaryConditionPointer;
       typedef SharedPointer< RightHandSide, DeviceType > RightHandSidePointer;
@@ -90,12 +90,12 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       void bindDofs( const MeshPointer& meshPointer,
                      const DofVectorPointer& dofs );
 
-      void getExplicitRHS( const RealType& time,
-                           const RealType& tau,
-                           const MeshPointer& meshPointer,
-                           DofVectorPointer& _u,
-                           DofVectorPointer& _fu,
-                           MeshDependentDataPointer& meshDependentData );
+      void getExplicitUpdate( const RealType& time,
+                              const RealType& tau,
+                              const MeshPointer& meshPointer,
+                              DofVectorPointer& _u,
+                              DofVectorPointer& _fu,
+                              MeshDependentDataPointer& meshDependentData );
 
       template< typename MatrixPointer >
       void assemblyLinearSystem( const RealType& time,
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 38eaba46820fb9d037d71250a811481af605c7dd..49058f6a41084b374f0bb03d3ca39a692aa63069 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -69,8 +69,6 @@ bool
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 writeEpilog( Logger& logger )
 {
-   logger.writeParameter< const char* >( "GPU transfer time:", "" );
-   this->gpuTransferTimer.writeLog( logger, 1 );
    return true;
 }
 
@@ -204,7 +202,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshPointer& meshPointer,
                 DofVectorPointer& uDofs,
@@ -239,9 +237,9 @@ getExplicitRHS( const RealType& time,
       time + tau,
       this->uPointer );*/
    
-   //uPointer->write( "u.txt", "gnuplot" );
-   //fuPointer->write( "fu.txt", "gnuplot" );
-   //getchar();
+   /*uPointer->write( "u.txt", "gnuplot" );
+   fuPointer->write( "fu.txt", "gnuplot" );
+   getchar();*/
 }
 
 template< typename Mesh,
@@ -278,8 +276,9 @@ assemblyLinearSystem( const RealType& time,
       this->uPointer,
       matrixPointer,
       bPointer );
-   /*matrix.print( cout );
-   cout << endl << b << endl;
+   //matrixPointer->print( std::cout );
+   //getchar();
+   /*cout << endl << b << endl;
    cout << endl << u << endl;
    abort();*/
    /*cout << "Matrix multiplication test ..." << std::endl;
diff --git a/src/TNL/Problems/MeanCurvatureFlowProblem.h b/src/TNL/Problems/MeanCurvatureFlowProblem.h
index 02ac91c9b7f54fd5cce6d39a334b50a2904b3f41..af5a34df8c320b7b86d7b1d60acca95e2ad89364 100644
--- a/src/TNL/Problems/MeanCurvatureFlowProblem.h
+++ b/src/TNL/Problems/MeanCurvatureFlowProblem.h
@@ -75,7 +75,7 @@ class MeanCurvatureFlowProblem : public PDEProblem< Mesh,
       void bindDofs( const MeshType& mesh,
                      DofVectorType& dofs );
 
-      void getExplicitRHS( const RealType& time,
+      void getExplicitUpdate( const RealType& time,
                            const RealType& tau,
                            const MeshType& mesh,
                            DofVectorType& _u,
diff --git a/src/TNL/Problems/MeanCurvatureFlowProblem_impl.h b/src/TNL/Problems/MeanCurvatureFlowProblem_impl.h
index 6a52f2e950409735912ad94dc884dfdba15ad947..c9092b12ce64fa409ae83a758fbd32ac1694d909 100644
--- a/src/TNL/Problems/MeanCurvatureFlowProblem_impl.h
+++ b/src/TNL/Problems/MeanCurvatureFlowProblem_impl.h
@@ -178,7 +178,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 MeanCurvatureFlowProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshType& mesh,
                 DofVectorType& inDofs,
diff --git a/src/TNL/Problems/PDEProblem.h b/src/TNL/Problems/PDEProblem.h
index 612e325e3771ec2560c955fe3ff64a98e19f3920..431f6588828d2166f42bbc297eb533ee00dd260c 100644
--- a/src/TNL/Problems/PDEProblem.h
+++ b/src/TNL/Problems/PDEProblem.h
@@ -12,7 +12,7 @@
 
 #include <TNL/Problems/Problem.h>
 #include <TNL/SharedPointer.h>
-#include <TNL/Matrices/CSR.h>
+#include <TNL/Matrices/SlicedEllpack.h>
 
 namespace TNL {
 namespace Problems {
@@ -34,7 +34,7 @@ class PDEProblem : public Problem< Real, Device, Index >
       typedef SharedPointer< MeshType, DeviceType > MeshPointer;
       typedef Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
       typedef SharedPointer< DofVectorType, DeviceType > DofVectorPointer;
-      typedef Matrices::CSR< RealType, DeviceType, IndexType > MatrixType;
+      typedef Matrices::SlicedEllpack< RealType, DeviceType, IndexType > MatrixType;
       typedef Containers::Vector< RealType, DeviceType, IndexType > MeshDependentDataType;
       typedef SharedPointer< MeshDependentDataType, DeviceType > MeshDependentDataPointer;
 
diff --git a/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver.h b/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver.h
index 634b1615e21b41f490551ee706fe355225e5730e..cf74d48ffe03724360e3904e6d974815beb75697 100644
--- a/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver.h
+++ b/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver.h
@@ -104,7 +104,7 @@ class NavierStokesSolver
                                   const Vector& e );
 
    template< typename SolverVectorType >
-   void getExplicitRhs( const RealType& time,
+   void getExplicitUpdate( const RealType& time,
                         const RealType& tau,
                         SolverVectorType& u,
                         SolverVectorType& fu );
diff --git a/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver_impl.h b/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver_impl.h
index e96231f525ec7d4a8c6d4e801dca997388d7a02b..50f2138dd01edf6112ecd6d151d1776aa2910edf 100644
--- a/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver_impl.h
+++ b/src/TNL/Problems/cfd/navier-stokes/NavierStokesSolver_impl.h
@@ -340,15 +340,15 @@ template< typename AdvectionScheme,
    template< typename SolverVectorType >
 void NavierStokesSolver< AdvectionScheme,
                       DiffusionScheme,
-                      BoundaryConditions >::getExplicitRhs( const RealType& time,
+                      BoundaryConditions >::getExplicitUpdate( const RealType& time,
                                                             const RealType& tau,
                                                             SolverVectorType& u,
                                                             SolverVectorType& fu )
 {
-   Assert( this->advection, );
-   Assert( this->u1Viscosity, );
-   Assert( this->u2Viscosity, );
-   Assert( this->boundaryConditions, );
+   TNL_ASSERT( this->advection, );
+   TNL_ASSERT( this->u1Viscosity, );
+   TNL_ASSERT( this->u2Viscosity, );
+   TNL_ASSERT( this->boundaryConditions, );
 
    SharedVector< RealType, DeviceType, IndexType > dofs_rho, dofs_rho_u1, dofs_rho_u2, dofs_e,
                                                       rho_t, rho_u1_t, rho_u2_t, e_t;
@@ -454,7 +454,7 @@ void NavierStokesSolver< AdvectionScheme,
            continue;
         }
 
-        this->advection->getExplicitRhs( c,
+        this->advection->getExplicitUpdate( c,
                                          rho_t[ c ],
                                          rho_u1_t[ c ],
                                          rho_u2_t[ c ],
diff --git a/src/TNL/SharedPointer.h b/src/TNL/SharedPointer.h
index f9120f023a9a2c1e01116080386d40228f207451..a0308daf19dce085093ef75633f13083d91cb03f 100644
--- a/src/TNL/SharedPointer.h
+++ b/src/TNL/SharedPointer.h
@@ -1,20 +1,15 @@
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
 /***************************************************************************
                           SharedPointer.h  -  description
                              -------------------
     begin                : May 6, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
 #pragma once
 
 #include <TNL/Devices/Host.h>
@@ -47,13 +42,8 @@
 
 namespace TNL {
 
-/***
- * Use the lazy mode if you do not want to call the object constructor in the
- * shared pointer constructor. You may call it later via the method recreate.
- */
 template< typename Object,
-          typename Device = typename Object::DeviceType,
-          bool lazy = false >
+          typename Device = typename Object::DeviceType >
 class SharedPointer
 {
    static_assert( ! std::is_same< Device, void >::value, "The device cannot be void. You need to specify the device explicitly in your code." );
@@ -62,8 +52,8 @@ class SharedPointer
 /****
  * Specialization for Devices::Host
  */
-template< typename Object, bool lazy >
-class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
+template< typename Object >
+class SharedPointer< Object, Devices::Host > : public SmartPointer
 {
    private:
       // Convenient template alias for controlling the selection of copy- and
@@ -75,14 +65,14 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
 
       // friend class will be needed for templated assignment operators
-      template< typename Object_, typename Device_, bool lazy_ >
+      template< typename Object_, typename Device_ >
       friend class SharedPointer;
 
    public:
 
       typedef Object ObjectType;
       typedef Devices::Host DeviceType;
-      typedef SharedPointer< Object, Devices::Host, lazy > ThisType;
+      typedef SharedPointer< Object, Devices::Host > ThisType;
 
       template< typename... Args >
       explicit  SharedPointer( Args... args )
@@ -91,8 +81,7 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
 #ifdef TNL_DEBUG_SHARED_POINTERS
          std::cerr << "Creating shared pointer to " << demangle(typeid(ObjectType).name()) << std::endl;
 #endif
-         if( ! lazy )
-            this->allocate( args... );
+         this->allocate( args... );
       }
 
       // this is needed only to avoid the default compiler-generated constructor
@@ -103,9 +92,9 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
       }
 
       // conditional constructor for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( const SharedPointer< Object_, DeviceType, lazy_ >& pointer )
+      SharedPointer( const SharedPointer< Object_, DeviceType >& pointer )
       : pd( (PointerData*) pointer.pd )
       {
          this->pd->counter += 1;
@@ -119,9 +108,9 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
       }
 
       // conditional constructor for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( SharedPointer< Object_, DeviceType, lazy_ >&& pointer )
+      SharedPointer( SharedPointer< Object_, DeviceType >&& pointer )
       : pd( (PointerData*) pointer.pd )
       {
          pointer.pd = nullptr;
@@ -201,9 +190,9 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
       }
 
       // conditional operator for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const ThisType& operator=( const SharedPointer< Object_, DeviceType, lazy_ >& ptr )
+      const ThisType& operator=( const SharedPointer< Object_, DeviceType >& ptr )
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -221,9 +210,9 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
       }
 
       // conditional operator for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const ThisType& operator=( SharedPointer< Object_, DeviceType, lazy_ >&& ptr )
+      const ThisType& operator=( SharedPointer< Object_, DeviceType >&& ptr )
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -235,6 +224,11 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
       {
          return true;
       }
+      
+      void clear()
+      {
+         this->free();
+      }
 
       ~SharedPointer()
       {
@@ -282,8 +276,8 @@ class SharedPointer< Object, Devices::Host, lazy > : public SmartPointer
 /****
  * Specialization for CUDA
  */
-template< typename Object, bool lazy >
-class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
+template< typename Object >
+class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 {
    private:
       // Convenient template alias for controlling the selection of copy- and
@@ -295,22 +289,21 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
 
       // friend class will be needed for templated assignment operators
-      template< typename Object_, typename Device_, bool lazy_ >
+      template< typename Object_, typename Device_ >
       friend class SharedPointer;
 
    public:
 
       typedef Object ObjectType;
       typedef Devices::Cuda DeviceType;
-      typedef SharedPointer< Object, Devices::Cuda, lazy > ThisType;
+      typedef SharedPointer< Object, Devices::Cuda > ThisType;
 
       template< typename... Args >
       explicit  SharedPointer( Args... args )
       : pd( nullptr ),
         cuda_pointer( nullptr )
       {
-         if( ! lazy )
-            this->allocate( args... );
+         this->allocate( args... );
       }
 
       // this is needed only to avoid the default compiler-generated constructor
@@ -322,9 +315,9 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       }
 
       // conditional constructor for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( const SharedPointer< Object_, DeviceType, lazy_ >& pointer )
+      SharedPointer( const SharedPointer< Object_, DeviceType >& pointer )
       : pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
       {
@@ -341,9 +334,9 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       }
 
       // conditional constructor for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( SharedPointer< Object_, DeviceType, lazy_ >&& pointer )
+      SharedPointer( SharedPointer< Object_, DeviceType >&& pointer )
       : pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
       {
@@ -412,8 +405,8 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       const Object& getData() const
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
             return this->pd->data;
          if( std::is_same< Device, Devices::Cuda >::value )
@@ -425,8 +418,8 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       Object& modifyData()
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
          {
             this->pd->maybe_modified = true;
@@ -450,9 +443,9 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       }
 
       // conditional operator for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const ThisType& operator=( const SharedPointer< Object_, DeviceType, lazy_ >& ptr )
+      const ThisType& operator=( const SharedPointer< Object_, DeviceType >& ptr )
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -479,9 +472,9 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
       }
 
       // conditional operator for non-const -> const data
-      template< typename Object_, bool lazy_,
+      template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const ThisType& operator=( SharedPointer< Object_, DeviceType, lazy_ >&& ptr )
+      const ThisType& operator=( SharedPointer< Object_, DeviceType >&& ptr )
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -505,7 +498,7 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
             std::cerr << "Synchronizing shared pointer: counter = " << this->pd->counter << ", type: " << demangle(typeid(Object).name()) << std::endl;
             std::cerr << "   ( " << sizeof( Object ) << " bytes, CUDA adress " << this->cuda_pointer << " )" << std::endl;
 #endif
-            Assert( this->cuda_pointer, );
+            TNL_ASSERT( this->cuda_pointer, );
             cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
             if( ! checkCudaDevice ) {
                return false;
@@ -518,6 +511,11 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
          return false;
 #endif
       }
+      
+      void clear()
+      {
+         this->free();
+      }      
 
       ~SharedPointer()
       {
@@ -563,14 +561,14 @@ class SharedPointer< Object, Devices::Cuda, lazy > : public SmartPointer
 
       void set_last_sync_state()
       {
-         Assert( this->pd, );
+         TNL_ASSERT( this->pd, );
          std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) );
          this->pd->maybe_modified = false;
       }
 
       bool modified()
       {
-         Assert( this->pd, );
+         TNL_ASSERT( this->pd, );
          // optimization: skip bitwise comparison if we're sure that the data is the same
          if( ! this->pd->maybe_modified )
             return false;
diff --git a/src/TNL/Solvers/BuildConfigTags.h b/src/TNL/Solvers/BuildConfigTags.h
index f479b3f9b80cd39b431561a926117073bda1aac0..190290299ed5a29cf3a5640c31a9e48d647045c2 100644
--- a/src/TNL/Solvers/BuildConfigTags.h
+++ b/src/TNL/Solvers/BuildConfigTags.h
@@ -25,7 +25,7 @@
 namespace TNL {
 namespace Solvers {   
 
-class tnlDefaultBuildConfigTag{};
+class DefaultBuildConfigTag{};
 
 /****
  * All devices are enabled by default. Those which are not available
diff --git a/src/TNL/Solvers/Linear/BICGStab.h b/src/TNL/Solvers/Linear/BICGStab.h
index b03037aca74199931eeabf0e2e68fb7fc42ef5ba..ce9828cbe70f35cb1615eaeec85d7f7116c82da8 100644
--- a/src/TNL/Solvers/Linear/BICGStab.h
+++ b/src/TNL/Solvers/Linear/BICGStab.h
@@ -39,8 +39,8 @@ class BICGStab : public Object,
    typedef typename Matrix::DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
    BICGStab();
 
diff --git a/src/TNL/Solvers/Linear/BICGStab_impl.h b/src/TNL/Solvers/Linear/BICGStab_impl.h
index 888ea1b77c5e55d103e751658f101b806c0ad784..4ab5f6979229ce57fa0c6bb4906fd4c896d875e7 100644
--- a/src/TNL/Solvers/Linear/BICGStab_impl.h
+++ b/src/TNL/Solvers/Linear/BICGStab_impl.h
@@ -26,6 +26,11 @@ template< typename Matrix,
           typename Preconditioner >
 BICGStab< Matrix, Preconditioner > :: BICGStab()
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();   
 }
 
 template< typename Matrix,
diff --git a/src/TNL/Solvers/Linear/CG.h b/src/TNL/Solvers/Linear/CG.h
index ac1f33fe46706f042f10ba61a58b238561ca26ea..2a7b2403da80dc9f0ad77d2ef3936a1fed8fb4a5 100644
--- a/src/TNL/Solvers/Linear/CG.h
+++ b/src/TNL/Solvers/Linear/CG.h
@@ -38,8 +38,8 @@ class CG : public Object,
    typedef typename Matrix::DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
 
    CG();
diff --git a/src/TNL/Solvers/Linear/CG_impl.h b/src/TNL/Solvers/Linear/CG_impl.h
index f1d8b10bddfcd7b77494480e3ee8ff7fe3ca38ff..ef83ce399de1249195e54f8bfa6a4e9e942d1bb1 100644
--- a/src/TNL/Solvers/Linear/CG_impl.h
+++ b/src/TNL/Solvers/Linear/CG_impl.h
@@ -18,6 +18,11 @@ template< typename Matrix,
           typename Preconditioner >
 CG< Matrix, Preconditioner > :: CG()
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();   
 }
 
 template< typename Matrix,
diff --git a/src/TNL/Solvers/Linear/CWYGMRES.h b/src/TNL/Solvers/Linear/CWYGMRES.h
index 7e909b0e187416d5b48608f83b83da232027a49d..13f8e514e537f27e9070020f23d530cf5f350a82 100644
--- a/src/TNL/Solvers/Linear/CWYGMRES.h
+++ b/src/TNL/Solvers/Linear/CWYGMRES.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          CWYGMRES.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <math.h>
@@ -27,8 +39,8 @@ public:
    typedef typename Matrix::DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
    typedef Containers::Vector< RealType, DeviceType, IndexType > DeviceVector;
    typedef Containers::Vector< RealType, Devices::Host, IndexType > HostVector;
 
diff --git a/src/TNL/Solvers/Linear/CWYGMRES_impl.h b/src/TNL/Solvers/Linear/CWYGMRES_impl.h
index 988e7b9ddc0a844fc12ed2d1927a29b999fc0b80..f15f4ab83532737e8fb712b9ce3dce626feb6d41 100644
--- a/src/TNL/Solvers/Linear/CWYGMRES_impl.h
+++ b/src/TNL/Solvers/Linear/CWYGMRES_impl.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          CWYGMRES.h  -  description
+                             -------------------
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <type_traits>
@@ -19,6 +31,11 @@ CWYGMRES()
   ldSize( 0 ),
   restarting( 10 )
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();   
 }
 
 template< typename Matrix,
@@ -98,7 +115,7 @@ bool
 CWYGMRES< Matrix, Preconditioner >::
 solve( const Vector& b, Vector& x )
 {
-   Assert( matrix, std::cerr << "No matrix was set in CWYGMRES. Call setMatrix() before solve()." << std::endl );
+   TNL_ASSERT( matrix, std::cerr << "No matrix was set in CWYGMRES. Call setMatrix() before solve()." << std::endl );
    if( restarting <= 0 )
    {
       std::cerr << "I have wrong value for the restarting of the CWYGMRES solver. It is set to " << restarting
@@ -434,8 +451,8 @@ hauseholder_apply_trunc( HostVector& out,
       // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access
       RealType* host_yi = &YL[ i * (restarting + 1) ];
       RealType host_z[ i + 1 ];
-      if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting + 1 ) ||
-          ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ) )
+      if( ! Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting + 1 ) ||
+          ! Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ) )
       {
          std::cerr << "Failed to copy part of device vectors y_i or z to host buffer." << std::endl;
          throw 1;
@@ -616,8 +633,12 @@ bool CWYGMRES< Matrix, Preconditioner > :: setSize( IndexType _size, IndexType m
 {
    if( size == _size && restarting == m ) return true;
    size = _size;
-   // align each column to 256 bytes
-   ldSize = roundToMultiple( size, 256 / sizeof( RealType ) );
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+      // align each column to 256 bytes - optimal for CUDA
+      ldSize = roundToMultiple( size, 256 / sizeof( RealType ) );
+   else
+       // on the host, we add 1 to disrupt the cache false-sharing pattern
+      ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1;
    restarting = m;
    if( ! r.setSize( size ) ||
        ! z.setSize( size ) ||
diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index d36d6527e4cc582bd9466f505bff219211bb75e1..95fdbd9e3f9ceb80792ff7e6c90d537c13257a09 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -37,8 +37,8 @@ public:
    typedef typename Matrix::DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
    GMRES();
 
@@ -90,6 +90,7 @@ protected:
    IndexType size, restarting;
 
    MatrixPointer matrix;
+   
    PreconditionerPointer preconditioner;
 };
 
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 5fef76b9bdf590f3f849c8f2597e90621d77681b..15c887c16d1aa9665fe0102e3317fbbd6b7d67a1 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -10,6 +10,8 @@
 
 #pragma once
 
+#include "GMRES.h"
+
 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -21,6 +23,11 @@ GMRES()
 : size( 0 ),
   restarting( 10 )
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();
 }
 
 template< typename Matrix,
@@ -100,7 +107,7 @@ bool
 GMRES< Matrix, Preconditioner >::
 solve( const Vector& b, Vector& x )
 {
-   Assert( matrix, std::cerr << "No matrix was set in GMRES. Call setMatrix() before solve()." << std::endl );
+   TNL_ASSERT( matrix, std::cerr << "No matrix was set in GMRES. Call setMatrix() before solve()." << std::endl );
    if( restarting <= 0 )
    {
       std::cerr << "I have wrong value for the restarting of the GMRES solver. It is set to " << restarting
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
index 1e4fbe49b00780a572afba8f7d905a7497414628..ecef1953923b3705de08ea5dd3c4d544dc6fd184 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h
@@ -1,3 +1,14 @@
+/***************************************************************************
+                          Diagonal.h  -  description
+                             -------------------
+    begin                : Dec 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
 
 #pragma once
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index 1033ab29c103b4afdd5bb3518f06689f482184a0..991b6a4858f9dcb7f58359835964d12d91c09c43 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -1,3 +1,14 @@
+/***************************************************************************
+                          Diagonal_impl.h  -  description
+                             -------------------
+    begin                : Dec 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
 
 #pragma once
 
@@ -46,7 +57,7 @@ update( const MatrixPointer& matrix )
 {
 //  std::cout << getType() << "->setMatrix()" << std::endl;
 
-   Assert( matrix->getRows() > 0 && matrix->getRows() == matrix->getColumns(), );
+   TNL_ASSERT( matrix->getRows() > 0 && matrix->getRows() == matrix->getColumns(), );
 
    if( diagonal.getSize() != matrix->getRows() )
       diagonal.setSize( matrix->getRows() );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/Dummy.h b/src/TNL/Solvers/Linear/Preconditioners/Dummy.h
index 291ca7e7568eec277d72ca1aee3ea833ad1394eb..5629568c78b0bf1571c6a2d25a46d4bdc0f3199e 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Dummy.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Dummy.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <TNL/Object.h>
+#include <TNL/SharedPointer.h>
 
 namespace TNL {
 namespace Solvers {
@@ -26,7 +27,12 @@ class Dummy
    void update( const Matrix& matrix ) {}
 
    template< typename Vector1, typename Vector2 >
-   bool solve( const Vector1& b, Vector2& x ) const { return true; }
+   bool solve( const Vector1& b, Vector2& x ) const
+   {
+      TNL_ASSERT( false,
+              std::cerr << "The solve() method of a dummy preconditioner should not be called." << std::endl; );
+      return true;
+   }
 
    String getType() const
    {
@@ -39,7 +45,8 @@ class SolverStarterSolverPreconditionerSetter
 {
    public:
        
-      static void run( LinearSolver& solver, Preconditioner& preconditioner )
+      static void run( LinearSolver& solver,
+                       SharedPointer< Preconditioner, typename LinearSolver::DeviceType >& preconditioner )
       {
          solver.setPreconditioner( preconditioner );
       }
@@ -53,7 +60,8 @@ class SolverStarterSolverPreconditionerSetter< LinearSolver, Dummy< Real, Device
       typedef Device DeviceType;
       typedef Dummy< Real, DeviceType, Index > PreconditionerType;
    
-      static void run( LinearSolver& solver, PreconditionerType& preconditioner )
+      static void run( LinearSolver& solver,
+                       SharedPointer< PreconditionerType, typename LinearSolver::DeviceType >& preconditioner )
       {
          // do nothing
       }
diff --git a/src/TNL/Solvers/Linear/SOR.h b/src/TNL/Solvers/Linear/SOR.h
index 2a3528f05c86563f06d18aa705c71b94f0b71bb2..d1159884c1dc98883d782db832c936da71e03186 100644
--- a/src/TNL/Solvers/Linear/SOR.h
+++ b/src/TNL/Solvers/Linear/SOR.h
@@ -36,8 +36,8 @@ class SOR : public Object,
    typedef typename Matrix :: DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
    SOR();
 
diff --git a/src/TNL/Solvers/Linear/SOR_impl.h b/src/TNL/Solvers/Linear/SOR_impl.h
index 8b99ea392f6b94790bb0752767025284dc4cdd43..d7d883f161342a4311caf8a7a99411fc36683d17 100644
--- a/src/TNL/Solvers/Linear/SOR_impl.h
+++ b/src/TNL/Solvers/Linear/SOR_impl.h
@@ -18,6 +18,11 @@ template< typename Matrix, typename Preconditioner >
 SOR< Matrix, Preconditioner > :: SOR()
 : omega( 1.0 )
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();   
 }
 
 template< typename Matrix, typename Preconditioner >
diff --git a/src/TNL/Solvers/Linear/TFQMR.h b/src/TNL/Solvers/Linear/TFQMR.h
index 8e47d7c8316375bea9027fa9abc200bbf13d7e59..94d1caabbb3a24957be6e23f3ee864fff7c0d5e5 100644
--- a/src/TNL/Solvers/Linear/TFQMR.h
+++ b/src/TNL/Solvers/Linear/TFQMR.h
@@ -39,8 +39,8 @@ class TFQMR : public Object,
    typedef typename Matrix::DeviceType DeviceType;
    typedef Matrix MatrixType;
    typedef Preconditioner PreconditionerType;
-   typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-   typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+   typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+   typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
    TFQMR();
 
@@ -56,9 +56,9 @@ class TFQMR : public Object,
 
    void setPreconditioner( const PreconditionerPointer& preconditioner );
 
-   template< typename VectorPointer,
-             typename ResidueGetter = LinearResidueGetter< Matrix, typename VectorPointer::ObjectType >  >
-   bool solve( const VectorPointer& b, VectorPointer& x );
+   template< typename Vector,
+             typename ResidueGetter = LinearResidueGetter< Matrix, Vector >  >
+   bool solve( const Vector& b, Vector& x );
 
    ~TFQMR();
 
diff --git a/src/TNL/Solvers/Linear/TFQMR_impl.h b/src/TNL/Solvers/Linear/TFQMR_impl.h
index b04c45d3c7371ff6e6e89280d7f45df9bcbeff9d..9e335aaf1e1722cdaf6a188e18ead8e2897651c3 100644
--- a/src/TNL/Solvers/Linear/TFQMR_impl.h
+++ b/src/TNL/Solvers/Linear/TFQMR_impl.h
@@ -19,6 +19,11 @@ template< typename Matrix,
 TFQMR< Matrix, Preconditioner > :: TFQMR()
 : size( 0 )
 {
+   /****
+    * Clearing the shared pointer means that there is no
+    * preconditioner set.
+    */
+   this->preconditioner.clear();   
 }
 
 template< typename Matrix,
diff --git a/src/TNL/Solvers/Linear/UmfpackWrapper.h b/src/TNL/Solvers/Linear/UmfpackWrapper.h
index 043e46c8d72aa14caa95cc8ebdb9d93171adb6f3..a2519d7c133d1e7bb8dfa3b0f912c93586b841a9 100644
--- a/src/TNL/Solvers/Linear/UmfpackWrapper.h
+++ b/src/TNL/Solvers/Linear/UmfpackWrapper.h
@@ -1,4 +1,14 @@
+/***************************************************************************
+                          UmfpackWrapper.h  -  description
+                             -------------------
+    begin                : Mar 21, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
 
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
 
 #pragma once
 
@@ -9,7 +19,7 @@
 #include <TNL/Object.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Matrices/CSR.h>
-#include <TNL/Solvers/preconditioners/Dummy.h>
+#include <TNL/Solvers/Linear/Preconditioners/Dummy.h>
 #include <TNL/Solvers/IterativeSolver.h>
 #include <TNL/Solvers/Linear/LinearResidueGetter.h>
 
@@ -25,16 +35,16 @@ struct is_csr_matrix
 };
 
 template< typename Real, typename Device, typename Index >
-struct is_csr_matrix< CSR< Real, Device, Index > >
+struct is_csr_matrix< Matrices::CSR< Real, Device, Index > >
 {
     static const bool value = true;
 };
 
 
 template< typename Matrix,
-          typename Preconditioner = Dummy< typename Matrix :: RealType,
-                                           typename Matrix :: DeviceType,
-                                           typename Matrix :: IndexType> >
+          typename Preconditioner = Preconditioners::Dummy< typename Matrix :: RealType,
+                                                            typename Matrix :: DeviceType,
+                                                            typename Matrix :: IndexType> >
 class UmfpackWrapper
     : public Object,
       // just to ensure the same interface as other linear solvers
@@ -47,8 +57,8 @@ public:
     typedef typename Matrix :: DeviceType DeviceType;
     typedef Matrix MatrixType;
     typedef Preconditioner PreconditionerType;
-    typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-    typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+    typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+    typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
     UmfpackWrapper()
     {
@@ -88,7 +98,7 @@ public:
 
 
 template< typename Preconditioner >
-class UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >
+class UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >
     : public Object,
       // just to ensure the same interface as other linear solvers
       public IterativeSolver< double, int >
@@ -97,10 +107,10 @@ public:
     typedef double RealType;
     typedef int IndexType;
     typedef Devices::Host DeviceType;
-    typedef CSR< double, Devices::Host, int > MatrixType;
+    typedef Matrices::CSR< double, Devices::Host, int > MatrixType;
     typedef Preconditioner PreconditionerType;
-    typedef SharedPointer< const MatrixType, DeviceType, true > MatrixPointer;
-    typedef SharedPointer< const PreconditionerType, DeviceType, true > PreconditionerPointer;
+    typedef SharedPointer< const MatrixType, DeviceType > MatrixPointer;
+    typedef SharedPointer< const PreconditionerType, DeviceType > PreconditionerPointer;
 
     UmfpackWrapper();
 
@@ -132,5 +142,4 @@ protected:
 
 #include "UmfpackWrapper_impl.h"
 
-
 #endif
diff --git a/src/TNL/Solvers/Linear/UmfpackWrapper_impl.h b/src/TNL/Solvers/Linear/UmfpackWrapper_impl.h
index dcb34521c8872b3a751fc0402ec0762d40d091aa..a07143a573900b4369cae396e43ea4b1a0797cc4 100644
--- a/src/TNL/Solvers/Linear/UmfpackWrapper_impl.h
+++ b/src/TNL/Solvers/Linear/UmfpackWrapper_impl.h
@@ -1,4 +1,14 @@
+/***************************************************************************
+                          UmfpackWrapper_impl.h  -  description
+                             -------------------
+    begin                : Mar 21, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
 
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
 
 #pragma once
 
@@ -11,13 +21,13 @@ namespace Solvers {
 namespace Linear {   
 
 template< typename Preconditioner >
-UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 UmfpackWrapper()
 {}
 
 template< typename Preconditioner >
 void
-UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 configSetup( Config::ConfigDescription& config,
              const String& prefix )
 {
@@ -25,7 +35,7 @@ configSetup( Config::ConfigDescription& config,
 
 template< typename Preconditioner >
 bool
-UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
@@ -33,14 +43,14 @@ setup( const Config::ParameterContainer& parameters,
 }
 
 template< typename Preconditioner >
-void UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+void UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 setMatrix( const MatrixPointer& matrix )
 {
     this -> matrix = matrix;
 }
 
 template< typename Preconditioner >
-void UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+void UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 setPreconditioner( const PreconditionerPointer& preconditioner )
 {
     this -> preconditioner = preconditioner;
@@ -49,12 +59,12 @@ setPreconditioner( const PreconditionerPointer& preconditioner )
 
 template< typename Preconditioner >
     template< typename Vector, typename ResidueGetter >
-bool UmfpackWrapper< CSR< double, Devices::Host, int >, Preconditioner >::
+bool UmfpackWrapper< Matrices::CSR< double, Devices::Host, int >, Preconditioner >::
 solve( const Vector& b,
        Vector& x )
 {
-    Assert( matrix->getRows() == matrix->getColumns(), );
-    Assert( matrix->getColumns() == x.getSize() && matrix->getColumns() == b.getSize(), );
+    TNL_ASSERT( matrix->getRows() == matrix->getColumns(), );
+    TNL_ASSERT( matrix->getColumns() == x.getSize() && matrix->getColumns() == b.getSize(), );
 
     const IndexType size = matrix -> getRows();
 
@@ -77,9 +87,9 @@ solve( const Vector& b,
 
     // symbolic reordering of the sparse matrix
     status = umfpack_di_symbolic( size, size,
-                                  matrix->rowPointers.getData(),
-                                  matrix->columnIndexes.getData(),
-                                  matrix->values.getData(),
+                                  matrix->getRowPointers(),
+                                  matrix->getColumnIndexes(),
+                                  matrix->getValues(),
                                   &Symbolic, Control, Info );
     if( status != UMFPACK_OK ) {
         std::cerr << "error: symbolic reordering failed" << std::endl;
@@ -87,9 +97,9 @@ solve( const Vector& b,
     }
 
     // numeric factorization
-    status = umfpack_di_numeric( matrix->rowPointers.getData(),
-                                 matrix->columnIndexes.getData(),
-                                 matrix->values.getData(),
+    status = umfpack_di_numeric( matrix->getRowPointers(),
+                                 matrix->getColumnIndexes(),
+                                 matrix->getValues(),
                                  Symbolic, &Numeric, Control, Info );
     if( status != UMFPACK_OK ) {
         std::cerr << "error: numeric factorization failed" << std::endl;
@@ -98,9 +108,9 @@ solve( const Vector& b,
 
     // solve with specified right-hand-side
     status = umfpack_di_solve( system_type,
-                               matrix->rowPointers.getData(),
-                               matrix->columnIndexes.getData(),
-                               matrix->values.getData(),
+                               matrix->getRowPointers(),
+                               matrix->getColumnIndexes(),
+                               matrix->getValues(),
                                x.getData(),
                                b.getData(),
                                Numeric, Control, Info );
diff --git a/src/TNL/Solvers/MeshTypeResolver.h b/src/TNL/Solvers/MeshTypeResolver.h
index c0dae65549302b7cfc9b8d110a282c9072aafadc..604b2dfbe70e4286a6ba5bcb3d00fa39cf7874cc 100644
--- a/src/TNL/Solvers/MeshTypeResolver.h
+++ b/src/TNL/Solvers/MeshTypeResolver.h
@@ -51,20 +51,20 @@ class MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
    protected:
 
    static bool resolveMeshDimensions( const Config::ParameterContainer& parameters,
-                                      const List< String >& parsedMeshType );
+                                      const Containers::List< String >& parsedMeshType );
 
    // Overload for disabled dimensions
    template< int MeshDimensions,
              typename = typename std::enable_if< ! ConfigTagDimensions<ConfigTag,MeshDimensions>::enabled >::type,
              typename = void >
    static bool resolveMeshRealType( const Config::ParameterContainer& parameters,
-                                    const List< String >& parsedMeshType );
+                                    const Containers::List< String >& parsedMeshType );
 
    // Overload for enabled dimensions
    template< int MeshDimensions,
              typename = typename std::enable_if< ConfigTagDimensions<ConfigTag,MeshDimensions>::enabled >::type >
    static bool resolveMeshRealType( const Config::ParameterContainer& parameters,
-                                    const List< String >& parsedMeshType );
+                                    const Containers::List< String >& parsedMeshType );
 
    // Overload for disabled real types
    template< int MeshDimensions,
@@ -72,14 +72,14 @@ class MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
              typename = typename std::enable_if< ! ConfigTagReal<ConfigTag, MeshRealType>::enabled >::type,
              typename = void >
    static bool resolveMeshIndexType( const Config::ParameterContainer& parameters,
-                                     const List< String >& parsedMeshType );
+                                     const Containers::List< String >& parsedMeshType );
 
    // Overload for enabled real types
    template< int MeshDimensions,
              typename MeshRealType,
              typename = typename std::enable_if< ConfigTagReal<ConfigTag, MeshRealType>::enabled >::type >
    static bool resolveMeshIndexType( const Config::ParameterContainer& parameters,
-                                     const List< String >& parsedMeshType );
+                                     const Containers::List< String >& parsedMeshType );
 
    // Overload for disabled index types
    template< int MeshDimensions,
@@ -88,7 +88,7 @@ class MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
              typename = typename std::enable_if< ! ConfigTagIndex<ConfigTag, MeshIndexType>::enabled >::type,
              typename = void >
    static bool resolveMeshType( const Config::ParameterContainer& parameters,
-                                const List< String >& parsedMeshType );
+                                const Containers::List< String >& parsedMeshType );
 
    // Overload for enabled index types
    template< int MeshDimensions,
@@ -96,7 +96,7 @@ class MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >
              typename MeshIndexType,
              typename = typename std::enable_if< ConfigTagIndex<ConfigTag, MeshIndexType>::enabled >::type >
    static bool resolveMeshType( const Config::ParameterContainer& parameters,
-                                const List< String >& parsedMeshType );
+                                const Containers::List< String >& parsedMeshType );
 
 
 
@@ -115,7 +115,7 @@ class MeshTypeResolverDimensionsSupportChecker< Dimensions, true, MeshTypeResolv
    public:
 
    static bool checkDimensions( const Config::ParameterContainer& parameters,
-                                const List< String >& parsedMeshType );
+                                const Containers::List< String >& parsedMeshType );
 };
 
 template< int Dimensions, typename MeshTypeResolver >
@@ -124,7 +124,7 @@ class MeshTypeResolverDimensionsSupportChecker< Dimensions, false, MeshTypeResol
    public:
 
    static bool checkDimensions( const Config::ParameterContainer& parameters,
-                                const List< String >& parsedMeshType );
+                                const Containers::List< String >& parsedMeshType );
 };*/
 
 } // namespace Solvers
diff --git a/src/TNL/Solvers/MeshTypeResolver_impl.h b/src/TNL/Solvers/MeshTypeResolver_impl.h
index 2ef00775bac59eba58b6ecb672c7cf3ba6b17d04..d1b1b6d4263387a71f1192c2391b1c8f5217c3d0 100644
--- a/src/TNL/Solvers/MeshTypeResolver_impl.h
+++ b/src/TNL/Solvers/MeshTypeResolver_impl.h
@@ -58,8 +58,8 @@ bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::ru
       std::cerr << "I am not able to detect the mesh type from the file " << meshFileName << "." << std::endl;
       return EXIT_FAILURE;
    }
-  std::cout << meshType << " detected in " << meshFileName << " file." << std::endl;
-   List< String > parsedMeshType;
+   std::cout << meshType << " detected in " << meshFileName << " file." << std::endl;
+   Containers::List< String > parsedMeshType;
    if( ! parseObjectType( meshType, parsedMeshType ) )
    {
       std::cerr << "Unable to parse the mesh type " << meshType << "." << std::endl;
@@ -73,8 +73,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Device,
           typename Index,
           typename ConfigTag >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshDimensions( const Config::ParameterContainer& parameters,
-                                                                                                        const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshDimensions( const Config::ParameterContainer& parameters,
+                       const Containers::List< String >& parsedMeshType )
 {
    int dimensions = atoi( parsedMeshType[ 1 ].getString() );
 
@@ -94,8 +96,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Index,
           typename ConfigTag >
    template< int MeshDimensions, typename, typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshRealType( const Config::ParameterContainer& parameters,
-                                                                                                      const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshRealType( const Config::ParameterContainer& parameters,
+                     const Containers::List< String >& parsedMeshType )
 {
    std::cerr << "Mesh dimension " << MeshDimensions << " is not supported." << std::endl;
    return false;
@@ -107,8 +111,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
           typename Index,
           typename ConfigTag >
    template< int MeshDimensions, typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshRealType( const Config::ParameterContainer& parameters,
-                                                                                                      const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshRealType( const Config::ParameterContainer& parameters,
+                     const Containers::List< String >& parsedMeshType )
 {
    if( parsedMeshType[ 2 ] == "float" )
       return resolveMeshIndexType< MeshDimensions, float >( parameters, parsedMeshType );
@@ -128,8 +134,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
    template< int MeshDimensions,
              typename MeshRealType,
              typename, typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshIndexType( const Config::ParameterContainer& parameters,
-                                                                                                        const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshIndexType( const Config::ParameterContainer& parameters,
+                      const Containers::List< String >& parsedMeshType )
 {
    std::cerr << "The type '" << parsedMeshType[ 4 ] << "' is not allowed for real type." << std::endl;
    return false;
@@ -143,8 +151,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
    template< int MeshDimensions,
              typename MeshRealType,
              typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshIndexType( const Config::ParameterContainer& parameters,
-                                                                                                        const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshIndexType( const Config::ParameterContainer& parameters,
+                      const Containers::List< String >& parsedMeshType )
 {
    if( parsedMeshType[ 4 ] == "short int" )
       return resolveMeshType< MeshDimensions, MeshRealType, short int >( parameters, parsedMeshType );
@@ -165,8 +175,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
              typename MeshRealType,
              typename MeshIndexType,
              typename, typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshType( const Config::ParameterContainer& parameters,
-                                                                                                   const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshType( const Config::ParameterContainer& parameters,
+                 const Containers::List< String >& parsedMeshType )
 {
    std::cerr << "The type '" << parsedMeshType[ 4 ] << "' is not allowed for indexing type." << std::endl;
    return false;
@@ -181,8 +193,10 @@ template< template< typename Real, typename Device, typename Index, typename Mes
              typename MeshRealType,
              typename MeshIndexType,
              typename >
-bool MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::resolveMeshType( const Config::ParameterContainer& parameters,
-                                                                                                   const List< String >& parsedMeshType )
+bool
+MeshTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::
+resolveMeshType( const Config::ParameterContainer& parameters,
+                 const Containers::List< String >& parsedMeshType )
 {
    if( parsedMeshType[ 0 ] == "Meshes::Grid" )
    {
diff --git a/src/TNL/Solvers/ODE/Euler_impl.h b/src/TNL/Solvers/ODE/Euler_impl.h
index 46706cf24c8796ed6e92681cadbfed46fddad102..4008ccf54750517bd24894fcec8a2ffd05e46f13 100644
--- a/src/TNL/Solvers/ODE/Euler_impl.h
+++ b/src/TNL/Solvers/ODE/Euler_impl.h
@@ -105,7 +105,7 @@ bool Euler< Problem > :: solve( DofVectorPointer& u )
        * Compute the RHS
        */
       //timer.stop();
-      this->problem->getExplicitRHS( time, currentTau, u, k1 );
+      this->problem->getExplicitUpdate( time, currentTau, u, k1 );
       //timer.start();
 
       RealType lastResidue = this->getResidue();
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 868bc32b59ce652acd55c7f1c58c39a172478e45..2ecf5aa15aacbc16ac2c9c0282b15cc136413d76 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -271,35 +271,35 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
 
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
-      this->problem->getExplicitRHS( time, tau, u, k1 );
+      this->problem->getExplicitUpdate( time, tau, u, k1 );
 
    #ifdef HAVE_OPENMP
    #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
    #endif
       for( IndexType i = 0; i < size; i ++ )
          _kAux[ i ] = _u[ i ] + tau * ( 1.0 / 3.0 * _k1[ i ] );
-      this->problem->getExplicitRHS( time + tau_3, tau, kAux, k2 );
+      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
 
    #ifdef HAVE_OPENMP
    #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k2, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
    #endif
       for( IndexType i = 0; i < size; i ++ )
          _kAux[ i ] = _u[ i ] + tau * 1.0 / 6.0 * ( _k1[ i ] + _k2[ i ] );
-      this->problem->getExplicitRHS( time + tau_3, tau, kAux, k3 );
+      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
 
    #ifdef HAVE_OPENMP
    #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
    #endif
       for( IndexType i = 0; i < size; i ++ )
          _kAux[ i ] = _u[ i ] + tau * ( 0.125 * _k1[ i ] + 0.375 * _k3[ i ] );
-      this->problem->getExplicitRHS( time + 0.5 * tau, tau, kAux, k4 );
+      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
 
    #ifdef HAVE_OPENMP
    #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, _k4, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
    #endif
       for( IndexType i = 0; i < size; i ++ )
          _kAux[ i ] = _u[ i ] + tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] );
-      this->problem->getExplicitRHS( time + tau, tau, kAux, k5 );
+      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
    }
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
@@ -310,7 +310,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
       this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
       const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
 
-      this->problem->getExplicitRHS( time, tau, u, k1 );
+      this->problem->getExplicitUpdate( time, tau, u, k1 );
       cudaThreadSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
@@ -320,7 +320,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
       }
       cudaThreadSynchronize();
-      this->problem->getExplicitRHS( time + tau_3, tau, kAux, k2 );
+      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
       cudaThreadSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
@@ -330,7 +330,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
       }
       cudaThreadSynchronize();
-      this->problem->getExplicitRHS( time + tau_3, tau, kAux, k3 );
+      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
       cudaThreadSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
@@ -340,7 +340,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
       }
       cudaThreadSynchronize();
-      this->problem->getExplicitRHS( time + 0.5 * tau, tau, kAux, k4 );
+      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
       cudaThreadSynchronize();
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
@@ -350,7 +350,7 @@ void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
          computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
       }
       cudaThreadSynchronize();
-      this->problem->getExplicitRHS( time + tau, tau, kAux, k5 );
+      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
       cudaThreadSynchronize();
 #endif
    }
diff --git a/src/TNL/Solvers/PDE/ExplicitTimeStepper.h b/src/TNL/Solvers/PDE/ExplicitTimeStepper.h
index e51290d05efd5ffd63dd516b3e765c50da524501..381803f2f9d2d8a2bfed04b3022a2bb5898fbb05 100644
--- a/src/TNL/Solvers/PDE/ExplicitTimeStepper.h
+++ b/src/TNL/Solvers/PDE/ExplicitTimeStepper.h
@@ -68,7 +68,7 @@ class ExplicitTimeStepper
                DofVectorPointer& dofVector,
                MeshDependentDataPointer& meshDependentData );
 
-   void getExplicitRHS( const RealType& time,
+   void getExplicitUpdate( const RealType& time,
                         const RealType& tau,
                         DofVectorPointer& _u,
                         DofVectorPointer& _fu );
diff --git a/src/TNL/Solvers/PDE/ExplicitTimeStepper_impl.h b/src/TNL/Solvers/PDE/ExplicitTimeStepper_impl.h
index ea6973f7ba98a77e7d0ac31464c8fd634ce741fb..dff5f2e6c53b79ee348391d702fb4d53e32ef102 100644
--- a/src/TNL/Solvers/PDE/ExplicitTimeStepper_impl.h
+++ b/src/TNL/Solvers/PDE/ExplicitTimeStepper_impl.h
@@ -122,7 +122,7 @@ solve( const RealType& time,
        DofVectorPointer& dofVector,
        MeshDependentDataPointer& meshDependentData )
 {
-   Assert( this->odeSolver, );
+   TNL_ASSERT( this->odeSolver, );
    mainTimer.start();
    this->odeSolver->setTau( this->timeStep );
    this->odeSolver->setProblem( * this );
@@ -144,7 +144,7 @@ template< typename Problem,
           template < typename OdeProblem > class OdeSolver >
 void
 ExplicitTimeStepper< Problem, OdeSolver >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 DofVectorPointer& u,
                 DofVectorPointer& fu )
@@ -172,7 +172,7 @@ getExplicitRHS( const RealType& time,
 
    this->explicitUpdaterTimer.start();
    this->problem->setExplicitBoundaryConditions( time, *this->mesh, u, *this->meshDependentData );
-   this->problem->getExplicitRHS( time, tau, *this->mesh, u, fu, *this->meshDependentData );
+   this->problem->getExplicitUpdate( time, tau, *this->mesh, u, fu, *this->meshDependentData );
    this->explicitUpdaterTimer.stop();
 
    if( this->solverMonitor )
diff --git a/src/TNL/Solvers/PDE/ExplicitUpdater.h b/src/TNL/Solvers/PDE/ExplicitUpdater.h
index b6579d606607462cee6efd5f98d2acc8b2e6666a..d06ac77ffb379a57382d44c97b5e7ca7868cbfe6 100644
--- a/src/TNL/Solvers/PDE/ExplicitUpdater.h
+++ b/src/TNL/Solvers/PDE/ExplicitUpdater.h
@@ -126,7 +126,7 @@ class ExplicitUpdater
                                               TraverserUserData& userData,
                                               const EntityType& entity )
             {
-               ( *userData.fu )( entity ) =                
+               ( *userData.fu )( entity ) = 
                   ( *userData.differentialOperator )( *userData.u, entity, userData.time );
 
                typedef Functions::FunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
diff --git a/src/TNL/Solvers/PDE/ExplicitUpdater_impl.h b/src/TNL/Solvers/PDE/ExplicitUpdater_impl.h
index 02348de740711107376e705c307c2b4c49dd86eb..72b044e5e619087f4566d02ab8ae698b875e46f3 100644
--- a/src/TNL/Solvers/PDE/ExplicitUpdater_impl.h
+++ b/src/TNL/Solvers/PDE/ExplicitUpdater_impl.h
@@ -42,7 +42,7 @@ update( const RealType& time,
                                 Containers::Vector< typename MeshFunction::RealType,
                                            typename MeshFunction::DeviceType,
                                            typename MeshFunction::IndexType > >::value != true,
-      "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitRHS."  );
+      "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitUpdate."  );
    {
       //SharedPointer< TraverserUserData, DeviceType >
       this->userDataPointer->setUserData( time,
diff --git a/src/TNL/Solvers/PDE/LinearSystemAssembler_impl.h b/src/TNL/Solvers/PDE/LinearSystemAssembler_impl.h
index 3692920a6544f171d3d60b583adf2b9513e03a74..6f3ce27918d9d36f7eead7bf16c50a59569e3c2b 100644
--- a/src/TNL/Solvers/PDE/LinearSystemAssembler_impl.h
+++ b/src/TNL/Solvers/PDE/LinearSystemAssembler_impl.h
@@ -44,10 +44,10 @@ assembly( const RealType& time,
                                 Containers::Vector< typename MeshFunction::RealType,
                                            typename MeshFunction::DeviceType,
                                            typename MeshFunction::IndexType > >::value != true,
-      "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitRHS."  );
+      "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitUpdate."  );
 
    const IndexType maxRowLength = matrixPointer.template getData< Devices::Host >().getMaxRowLength();
-   Assert( maxRowLength > 0, );
+   TNL_ASSERT( maxRowLength > 0, );
 
    {
       this->userDataPointer->setUserData(
diff --git a/src/TNL/Solvers/PDE/PDESolver.h b/src/TNL/Solvers/PDE/PDESolver.h
index eafbf437076e62296cffae1b97d305a1913a0610..a4f47ef51c8e6472c8d4807254c98db8cdc5f661 100644
--- a/src/TNL/Solvers/PDE/PDESolver.h
+++ b/src/TNL/Solvers/PDE/PDESolver.h
@@ -87,7 +87,7 @@ class PDESolver : public Object
 
       DofVectorPointer dofsPointer;
 
-      MeshDependentDataPointer meshDependentData;
+      MeshDependentDataPointer meshDependentDataPointer;
 
       TimeStepper* timeStepper;
 
diff --git a/src/TNL/Solvers/PDE/PDESolver_impl.h b/src/TNL/Solvers/PDE/PDESolver_impl.h
index 5fde1f98be0b041e5e4138e58f79e63674e958a0..139d8fcc95169bd2938eb2ba9397b51425b962d4 100644
--- a/src/TNL/Solvers/PDE/PDESolver_impl.h
+++ b/src/TNL/Solvers/PDE/PDESolver_impl.h
@@ -71,7 +71,6 @@ setup( const Config::ParameterContainer& parameters,
    /****
     * Setup the problem
     */
-  
    if( ! problem->setup( this->meshPointer, parameters, prefix ) )
    {
       std::cerr << "The problem initiation failed!" << std::endl;
@@ -81,7 +80,7 @@ setup( const Config::ParameterContainer& parameters,
    /****
     * Set DOFs (degrees of freedom)
     */
-   Assert( problem->getDofs( this->meshPointer ) != 0, );
+   TNL_ASSERT( problem->getDofs( this->meshPointer ) != 0, );
    std::cout << "Allocating dofs ... ";
    if( ! this->dofsPointer->setSize( problem->getDofs( this->meshPointer ) ) )
    {
@@ -96,15 +95,15 @@ setup( const Config::ParameterContainer& parameters,
    /****
     * Set mesh dependent data
     */
-   this->problem->setMeshDependentData( this->meshPointer, this->meshDependentData );
-   this->problem->bindMeshDependentData( this->meshPointer, this->meshDependentData );
+   this->problem->setMeshDependentData( this->meshPointer, this->meshDependentDataPointer );
+   this->problem->bindMeshDependentData( this->meshPointer, this->meshDependentDataPointer );
    
    /***
     * Set-up the initial condition
     */
   std::cout << "Setting up the initial condition ... ";
    typedef typename Problem :: DofVectorType DofVectorType;
-   if( ! this->problem->setInitialCondition( parameters, meshPointer, this->dofsPointer, this->meshDependentData ) )
+   if( ! this->problem->setInitialCondition( parameters, meshPointer, this->dofsPointer, this->meshDependentDataPointer ) )
       return false;
   std::cout << " [ OK ]" << std::endl;
 
@@ -320,9 +319,9 @@ bool
 PDESolver< Problem, TimeStepper >::
 solve()
 {
-   Assert( timeStepper != 0,
+   TNL_ASSERT( timeStepper != 0,
               std::cerr << "No time stepper was set in PDESolver." );
-   Assert( problem != 0,
+   TNL_ASSERT( problem != 0,
               std::cerr << "No problem was set in PDESolver." );
 
    if( snapshotPeriod == 0 )
@@ -338,7 +337,7 @@ solve()
    this->computeTimer->reset();
  
    this->ioTimer->start();
-   if( ! this->problem->makeSnapshot( t, step, meshPointer, this->dofsPointer, this->meshDependentData ) )
+   if( ! this->problem->makeSnapshot( t, step, meshPointer, this->dofsPointer, this->meshDependentDataPointer ) )
    {
       std::cerr << "Making the snapshot failed." << std::endl;
       return false;
@@ -356,14 +355,14 @@ solve()
    {
       RealType tau = min( this->snapshotPeriod,
                           this->finalTime - t );
-      if( ! this->timeStepper->solve( t, t + tau, this->meshPointer, this->dofsPointer, this->meshDependentData ) )
+      if( ! this->timeStepper->solve( t, t + tau, this->meshPointer, this->dofsPointer, this->meshDependentDataPointer ) )
          return false;
       step ++;
       t += tau;
 
       this->ioTimer->start();
       this->computeTimer->stop();
-      if( ! this->problem->makeSnapshot( t, step, this->meshPointer, this->dofsPointer, this->meshDependentData ) )
+      if( ! this->problem->makeSnapshot( t, step, this->meshPointer, this->dofsPointer, this->meshDependentDataPointer ) )
       {
          std::cerr << "Making the snapshot failed." << std::endl;
          return false;
diff --git a/src/TNL/Solvers/PDE/SemiImplicitTimeStepper_impl.h b/src/TNL/Solvers/PDE/SemiImplicitTimeStepper_impl.h
index b0277034d19db2f98ec4976cdffe8259e0ea5bcd..6e996c2828e8b63200cf031eb53bef5d1728e251 100644
--- a/src/TNL/Solvers/PDE/SemiImplicitTimeStepper_impl.h
+++ b/src/TNL/Solvers/PDE/SemiImplicitTimeStepper_impl.h
@@ -154,11 +154,11 @@ solve( const RealType& time,
        DofVectorPointer& dofVector,
        MeshDependentDataPointer& meshDependentData )
 {
-   Assert( this->problem != 0, );
+   TNL_ASSERT( this->problem != 0, );
    RealType t = time;
    this->linearSystemSolver->setMatrix( this->matrix );
    PreconditionerPointer preconditioner;
-   Linear::Preconditioners::SolverStarterSolverPreconditionerSetter< LinearSystemSolverType, PreconditionerPointer >
+   Linear::Preconditioners::SolverStarterSolverPreconditionerSetter< LinearSystemSolverType, PreconditionerType >
        ::run( *(this->linearSystemSolver), preconditioner );
 
    while( t < stopTime )
diff --git a/src/TNL/Solvers/Solver.h b/src/TNL/Solvers/Solver.h
index b1d67352352a5cced97aa9b9bf91d8f0a0ba13b9..0d5925a1eb80b1fc7a81c94d129c70c5de242545 100644
--- a/src/TNL/Solvers/Solver.h
+++ b/src/TNL/Solvers/Solver.h
@@ -17,7 +17,7 @@ namespace Solvers {
 
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter > class ProblemSetter,
           template< typename ConfTag > class ProblemConfig,
-          typename ConfigTag = tnlDefaultBuildConfigTag >
+          typename ConfigTag = DefaultBuildConfigTag >
 class Solver
 {
    public:
diff --git a/src/TNL/Solvers/SolverConfig_impl.h b/src/TNL/Solvers/SolverConfig_impl.h
index 76234b9880e45db4d023fa898622bef75ff361d7..eb981bb35b9ade583e8538376978443e04584e14 100644
--- a/src/TNL/Solvers/SolverConfig_impl.h
+++ b/src/TNL/Solvers/SolverConfig_impl.h
@@ -116,7 +116,7 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri
       if( ConfigTagSemiImplicitSolver< ConfigTag, SemiImplicitSORSolverTag >::enabled )
          config.addEntryEnum( "sor" );
 #ifdef HAVE_UMFPACK
-      if( MeshConfigSemiImplicitSolver< MeshConfig, SemiImplicitUmfpackSolverTag >::enabled )
+      if( ConfigTagSemiImplicitSolver< ConfigTag, SemiImplicitUmfpackSolverTag >::enabled )
          config.addEntryEnum( "umfpack" );
 #endif
    }
diff --git a/src/TNL/Solvers/SolverMonitor.h b/src/TNL/Solvers/SolverMonitor.h
index cbd8bef7240f01a0878dd53e2b03c6179b284e53..46911415f469aa16f52ac683607e598e2736cd2e 100644
--- a/src/TNL/Solvers/SolverMonitor.h
+++ b/src/TNL/Solvers/SolverMonitor.h
@@ -24,7 +24,8 @@ class SolverMonitor
 
    SolverMonitor()
       : timeout_milliseconds(500),
-        stopped(true)
+        stopped(true),
+        timer(nullptr)
    {};
 
    ~SolverMonitor() {};
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index 50c3e3bad93cb3870659a9fffa2ce36f8526fcff..2d988c8b4120920ca346dd7e8e205b43f64660f9 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -79,12 +79,6 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame
        ! Devices::Cuda::setup( parameters ) )
       return false;
    Problem problem;
-   /*if( ! problem.setup( parameters ) )
-   {
-      std::cerr << "The problem initiation failed!" << std::endl;
-      return false;
-   }*/
-
    return tnlUserDefinedTimeDiscretisationSetter< Problem, ConfigTag >::run( problem, parameters );
 }
 
@@ -484,6 +478,11 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver&
       return false;
    logger.writeParameter< const char* >( "Compute time:", "" );
    this->computeTimer.writeLog( logger, 1 );
+   if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value )
+   {
+      logger.writeParameter< const char* >( "GPU synchronization time:", "" );
+      TNL::Devices::Cuda::smartPointersSynchronizationTimer.writeLog( logger, 1 );
+   }   
    logger.writeParameter< const char* >( "I/O time:", "" );
    this->ioTimer.writeLog( logger, 1 );
    logger.writeParameter< const char* >( "Total time:", "" );
diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h
index 19eaeaba77b60361c01c9a1e190579bd1f9e4f33..322c509852749fca2b063e54707ae776c7096b8a 100644
--- a/src/TNL/StaticFor.h
+++ b/src/TNL/StaticFor.h
@@ -129,7 +129,7 @@ class StaticFor
                          StaticForIndexTag< IndexType, end - begin >,
                          LoopBody >::exec();
 #else
-     Assert( false, );
+     TNL_ASSERT( false, );
 #endif
    }
 
@@ -143,7 +143,7 @@ class StaticFor
                          StaticForIndexTag< IndexType, end - begin >,
                          LoopBody >::exec( p );
 #else
-     Assert( false, );
+     TNL_ASSERT( false, );
 #endif
    }
 
@@ -158,7 +158,7 @@ class StaticFor
                          StaticForIndexTag< IndexType, end - begin >,
                          LoopBody >::exec( p0, p1 );
 #else
-     Assert( false, );
+     TNL_ASSERT( false, );
 #endif
    }
 
@@ -174,7 +174,7 @@ class StaticFor
                          StaticForIndexTag< IndexType, end - begin >,
                          LoopBody >::exec( p0, p1, p2 );
 #else
-     Assert( false, );
+     TNL_ASSERT( false, );
 #endif
    }
 
@@ -191,7 +191,7 @@ class StaticFor
                          StaticForIndexTag< IndexType, end - begin >,
                          LoopBody >::exec( p0, p1, p2, p3 );
 #else
-     Assert( false, );
+     TNL_ASSERT( false, );
 #endif
    }
 };
diff --git a/src/TNL/String.cpp b/src/TNL/String.cpp
index 723f024a22e623facf03a9994bfe55f81ac6d6c9..62d932c6951913957577dbcdcdb0413a61c27bcb 100644
--- a/src/TNL/String.cpp
+++ b/src/TNL/String.cpp
@@ -13,7 +13,7 @@
 #include <assert.h>
 #include <TNL/String.h>
 #include <TNL/Assert.h>
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/File.h>
 #include <TNL/Math.h>
 #ifdef HAVE_MPI
@@ -55,6 +55,12 @@ String :: String( int number )
    this->setString( convertToString( number ).getString() );
 }
 
+String :: String( unsigned long int number )
+: string( 0 ), length( 0 )
+{
+   this->setString( convertToString( number ).getString() );
+}
+
 String :: String( long int number )
 : string( 0 ), length( 0 )
 {
@@ -104,21 +110,21 @@ void String :: setString( const char* c, int prefix_cut_off, int sufix_cut_off )
       length = STRING_PAGE * ( _length / STRING_PAGE + 1 );
       string = new char[ length ];
    }
-   Assert( string, );
+   TNL_ASSERT( string, );
    memcpy( string, c + min( c_len, prefix_cut_off ), sizeof( char ) * ( _length ) );
    string[ _length ] = 0;
 }
 
 const char& String :: operator[]( int i ) const
 {
-   Assert( i >= 0 && i < length,
+   TNL_ASSERT( i >= 0 && i < length,
               std::cerr << "Accessing char outside the string." );
    return string[ i ];
 }
 
 char& String :: operator[]( int i )
 {
-   Assert( i >= 0 && i < length,
+   TNL_ASSERT( i >= 0 && i < length,
               std::cerr << "Accessing char outside the string." );
    return string[ i ];
 }
@@ -269,6 +275,23 @@ replace( const String& pattern,
    this->string = newString;
 }
 
+String
+String::strip( char strip ) const
+{
+   int prefix_cut_off = 0;
+   int sufix_cut_off = 0;
+
+   while( prefix_cut_off < getLength() && (*this)[ prefix_cut_off ] == strip )
+      prefix_cut_off++;
+
+   while( sufix_cut_off < getLength() && (*this)[ getLength() - 1 - sufix_cut_off ] == strip )
+      sufix_cut_off++;
+
+   if( prefix_cut_off + sufix_cut_off < getLength() )
+      return String( getString(), prefix_cut_off, sufix_cut_off );
+   return "";
+}
+
 
 const char* String :: getString() const
 {
@@ -323,7 +346,7 @@ bool String :: load( std::istream& file )
 
 bool String :: save( File& file ) const
 {
-   Assert( string,
+   TNL_ASSERT( string,
               std::cerr << "string = " << string );
 
    int len = strlen( string );
@@ -421,7 +444,7 @@ bool String :: getLine( std::istream& stream )
    return true;
 }
 
-int String :: parse( List< String >& list, const char separator ) const
+int String :: parse( Containers::List< String >& list, const char separator ) const
 {
    list.reset();
    String copy( *this );
diff --git a/src/TNL/String.h b/src/TNL/String.h
index 45d94d13b954f93c276320600b442cc872ba9df4..3dd407e24b692340da2e622e179746eb9126553a 100644
--- a/src/TNL/String.h
+++ b/src/TNL/String.h
@@ -18,8 +18,10 @@
 
 namespace TNL {
 
-template< class T > class List;
 class File;
+namespace Containers {
+   template< class T > class List;
+}
 
 //! Class for managing strings
 class String
@@ -53,6 +55,8 @@ class String
 
    String( int number );
  
+   String( unsigned long int number );
+
    String( long int number );
 
    String( float number );
@@ -126,6 +130,8 @@ class String
    void replace( const String& pattern,
                  const String& replaceWith );
 
+   String strip( char strip = ' ' ) const;
+
    // TODO: remove
    //! Write to a binary file
    bool save( std::ostream& file ) const;
@@ -147,7 +153,7 @@ class String
    bool getLine( std::istream& stream );
 
    //! Parse the string into list of strings w.r.t. given separator.
-   int parse( List< String >& list, const char separator = ' ' ) const;
+   int parse( Containers::List< String >& list, const char separator = ' ' ) const;
 
    friend std::ostream& operator << ( std::ostream& stream, const String& str );
 };
diff --git a/src/TNL/SystemInfo.cpp b/src/TNL/SystemInfo.cpp
deleted file mode 100644
index bbc23b5e211d3a96a1be1eeb081268acdd9ad43f..0000000000000000000000000000000000000000
--- a/src/TNL/SystemInfo.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <set>
-#include <iomanip>
-#include <cstring>
-#include <ctime>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "SystemInfo.h"
-
-namespace TNL {
-
-SystemInfo::SystemInfo()
-{
-   uname( &uts );
-   parseCPUInfo();
-}
-
-void
-SystemInfo::parseCPUInfo( void )
-{
-   std::ifstream file( "/proc/cpuinfo" );
-   if( ! file ) {
-      std::cerr << "Unable to read information from /proc/cpuinfo." << std::endl;
-      return;
-   }
-
-   char line[ 1024 ];
-   std::set< int > processors;
-   while( ! file. eof() )
-   {
-      int i;
-      file.getline( line, 1024 );
-      if( strncmp( line, "physical id", strlen( "physical id" ) ) == 0 )
-      {
-         i = strlen( "physical id" );
-         while( line[ i ] != ':' && line[ i ] ) i ++;
-         processors.insert( atoi( &line[ i + 1 ] ) );
-         continue;
-      }
-      // FIXME: the rest does not work on heterogeneous multi-socket systems
-      if( strncmp( line, "model name", strlen( "model name" ) ) == 0 )
-      {
-         i = strlen( "model name" );
-         while( line[ i ] != ':' && line[ i ] ) i ++;
-         CPUModelName.setString( &line[ i + 1 ] );
-         continue;
-      }
-      if( strncmp( line, "cpu cores", strlen( "cpu cores" ) ) == 0 )
-      {
-         i = strlen( "cpu MHz" );
-         while( line[ i ] != ':' && line[ i ] ) i ++;
-         CPUCores = atoi( &line[ i + 1 ] );
-         continue;
-      }
-      if( strncmp( line, "siblings", strlen( "siblings" ) ) == 0 )
-      {
-         i = strlen( "siblings" );
-         while( line[ i ] != ':' && line[ i ] ) i ++;
-         CPUThreads = atoi( &line[ i + 1 ] );
-      }
-   }
-   numberOfProcessors = processors.size();
-}
-
-String
-SystemInfo::getHostname( void ) const
-{
-   char host_name[ 256 ];
-   gethostname( host_name, 255 );
-   return String( host_name );
-}
-
-String
-SystemInfo::getArchitecture( void ) const
-{
-   return String( uts.machine );
-}
-
-String
-SystemInfo::getSystemName( void ) const
-{
-   return String( uts.sysname );
-}
-
-String
-SystemInfo::getSystemRelease( void ) const
-{
-   return String( uts.release );
-}
-
-String
-SystemInfo::getCurrentTime( const char* format ) const
-{
-   const std::time_t time_since_epoch = std::time( nullptr );
-   std::tm* localtime = std::localtime( &time_since_epoch );
-   // TODO: use std::put_time in the future (available since GCC 5)
-//   std::stringstream ss;
-//   ss << std::put_time( localtime, format );
-//   return String( ss.str().c_str() );
-   char buffer[1024];
-   std::strftime( buffer, 1024, format, localtime );
-   return String( buffer );
-}
-
-int
-SystemInfo::getNumberOfProcessors( void ) const
-{
-   return numberOfProcessors;
-}
-
-String
-SystemInfo::getOnlineCPUs( void ) const
-{
-   std::string online = readFile< std::string >( "/sys/devices/system/cpu/online" );
-   return String( online.c_str() );
-}
-
-int
-SystemInfo::getNumberOfCores( int cpu_id ) const
-{
-   return CPUCores;
-}
-
-int
-SystemInfo::getNumberOfThreads( int cpu_id ) const
-{
-   return CPUThreads;
-}
-
-String
-SystemInfo::getCPUModelName( int cpu_id ) const
-{
-   return CPUModelName;
-}
-
-int
-SystemInfo::getCPUMaxFrequency( int cpu_id ) const
-{
-   String fileName( "/sys/devices/system/cpu/cpu" );
-   fileName += String( cpu_id ) + "/cpufreq/cpuinfo_max_freq";
-   return readFile< int >( fileName );
-}
-
-tnlCacheSizes
-SystemInfo::getCPUCacheSizes( int cpu_id ) const
-{
-   String directory( "/sys/devices/system/cpu/cpu" );
-   directory += String( cpu_id ) + "/cache";
-
-   tnlCacheSizes sizes;
-   for( int i = 0; i <= 3; i++ ) {
-      const String cache = directory + "/index" + String( i );
-
-      // check if the directory exists
-      struct stat st;
-      if( stat( cache.getString(), &st ) != 0 || ! S_ISDIR( st.st_mode ) )
-         break;
-
-      const int level = readFile< int >( cache + "/level" );
-      const std::string type = readFile< std::string >( cache + "/type" );
-      const int size = readFile< int >( cache + "/size" );
-
-      if( level == 1 && type == "Instruction" )
-         sizes.L1instruction = size;
-      else if( level == 1 && type == "Data" )
-         sizes.L1data = size;
-      else if( level == 2 )
-         sizes.L2 = size;
-      else if( level == 3 )
-         sizes.L3 = size;
-   }
-   return sizes;
-}
-
-} // namespace TNL
diff --git a/src/TNL/SystemInfo.h b/src/TNL/SystemInfo.h
deleted file mode 100644
index 6d0ce5cd9941d4431b209b5e47feb821620daee9..0000000000000000000000000000000000000000
--- a/src/TNL/SystemInfo.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#pragma once
-
-#include <fstream>
-#include <sstream>
-
-#include <sys/utsname.h>
-
-#include <TNL/String.h>
-
-namespace TNL {
-
-// TODO: Move this to Devices::Host
-struct tnlCacheSizes {
-   int L1instruction = 0;
-   int L1data = 0;
-   int L2 = 0;
-   int L3 = 0;
-};
-
-class SystemInfo
-{
-public:
-   SystemInfo();
-
-   String getHostname( void ) const;
-   String getArchitecture( void ) const;
-   String getSystemName( void ) const;
-   String getSystemRelease( void ) const;
-   String getCurrentTime( const char* format = "%a %b %d %Y, %H:%M:%S" ) const;
-
-   int getNumberOfProcessors( void ) const;
-   String getOnlineCPUs( void ) const;
-   int getNumberOfCores( int cpu_id ) const;
-   int getNumberOfThreads( int cpu_id ) const;
-   String getCPUModelName( int cpu_id ) const;
-   int getCPUMaxFrequency( int cpu_id ) const;
-   tnlCacheSizes getCPUCacheSizes( int cpu_id ) const;
-
-protected:
-   struct utsname uts;
-   int numberOfProcessors = 0;
-   String CPUModelName;
-   int CPUThreads = 0;
-   int CPUCores = 0;
-
-   void parseCPUInfo( void );
-
-   template< typename ResultType >
-   ResultType
-   readFile( const String & fileName ) const
-   {
-      std::ifstream file( fileName.getString() );
-      if( ! file ) {
-         std::cerr << "Unable to read information from " << fileName << "." << std::endl;
-         return 0;
-      }
-      ResultType result;
-      file >> result;
-      return result;
-   }
-};
-
-} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Timer.cpp b/src/TNL/Timer.cpp
index 06a50358017c631b9710d5e88b1fd093e43fc4a3..dc561202e7f6f7b759cc599edba7e37d2a1b7362 100644
--- a/src/TNL/Timer.cpp
+++ b/src/TNL/Timer.cpp
@@ -9,6 +9,7 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #include <TNL/Timer.h>
+#include <TNL/Logger.h>
 
 #include <TNL/tnlConfig.h>
 #ifdef HAVE_SYS_RESOURCE_H
diff --git a/src/TNL/Timer.h b/src/TNL/Timer.h
index 58e35e03d1636178ac6907ade662b8217447639a..5019ab46f3bf2ff0dcef2bed65d96ac9af848457 100644
--- a/src/TNL/Timer.h
+++ b/src/TNL/Timer.h
@@ -11,10 +11,10 @@
 
 #pragma once
 
-#include <TNL/Logger.h>
-
 namespace TNL {
 
+class Logger;
+
 class Timer
 {
    public:
diff --git a/src/TNL/UniquePointer.h b/src/TNL/UniquePointer.h
index 82e0ff11f5d832099c0def3155bc0f07f2b2521e..f8e85e04756e218463d7d6f21c85792238f0cb79 100644
--- a/src/TNL/UniquePointer.h
+++ b/src/TNL/UniquePointer.h
@@ -1,20 +1,15 @@
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
 /***************************************************************************
                           UniquePointer.h  -  description
                              -------------------
     begin                : May 6, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
 #pragma once
 
 #include <TNL/Devices/Host.h>
@@ -162,8 +157,8 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
       const Object& getData() const
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
             return this->pd->data;
          if( std::is_same< Device, Devices::Cuda >::value )
@@ -174,8 +169,8 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
       Object& modifyData()
       {
          static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
-         Assert( this->pd, );
-         Assert( this->cuda_pointer, );
+         TNL_ASSERT( this->pd, );
+         TNL_ASSERT( this->cuda_pointer, );
          if( std::is_same< Device, Devices::Host >::value )
          {
             this->pd->maybe_modified = true;
@@ -258,14 +253,14 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
 
       void set_last_sync_state()
       {
-         Assert( this->pd, );
+         TNL_ASSERT( this->pd, );
          std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( ObjectType ) );
          this->pd->maybe_modified = false;
       }
 
       bool modified()
       {
-         Assert( this->pd, );
+         TNL_ASSERT( this->pd, );
          // optimization: skip bitwise comparison if we're sure that the data is the same
          if( ! this->pd->maybe_modified )
             return false;
diff --git a/src/TNL/core/CMakeLists.txt b/src/TNL/core/CMakeLists.txt
deleted file mode 100755
index 88d96b42eb1c3e38a03cf8c352623d625d1fbfc0..0000000000000000000000000000000000000000
--- a/src/TNL/core/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-set (headers 
-             tnlConstants.h
-             tnlIndexedSet.h
-             mfilename.h 
-             mfuncs.h 
-             mpi-supp.h 
-             param-types.h )
-
-SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/core )
-set( common_SOURCES
-     ${CURRENT_DIR}/mfilename.cpp 
-     ${CURRENT_DIR}/mpi-supp.cpp )
-
-IF( BUILD_CUDA )
-   set( tnl_core_CUDA__SOURCES
-        ${tnl_core_arrays_CUDA__SOURCES}
-        ${tnl_core_containers_CUDA__SOURCES}
-        ${tnl_core_cuda_CUDA__SOURCES}
-        ${tnl_core_vectors_CUDA__SOURCES}
-        ${tnl_core_images_CUDA__SOURCES}
-        ${common_SOURCES} 
-        PARENT_SCOPE )
-ENDIF()    
-
-set( tnl_core_SOURCES     
-     ${tnl_core_arrays_SOURCES}
-     ${tnl_core_containers_SOURCES}
-     ${tnl_core_cuda_SOURCES}
-     ${tnl_core_vectors_SOURCES}
-     ${tnl_core_images_SOURCES}
-     ${common_SOURCES}
-     PARENT_SCOPE )
-    
-
-#SET( libtnlcoreincludedir ${TNL_INCLUDE_DIR}/core )
-#SET( libtnlcoreinclude_HEADERS ${headers} )
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/TNL/core )
-
-
-
diff --git a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h b/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h
index c3784b64e7b2e621da62b02775974bbf0ef7f68c..33153a50ff0e192db596897a7b701ace99171f68 100644
--- a/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h
+++ b/src/TNL/legacy/benchmarks/matrix-solvers-benchmark.h
@@ -357,7 +357,7 @@ int main( int argc, char* argv[] )
       std::cerr << "Unable to detect object type in " << inputFile << std::endl;
       return EXIT_FAILURE;
    }
-   List< String > parsedObjectType;
+   Containers::List< String > parsedObjectType;
    parseObjectType( objectType,
                     parsedObjectType );
    String objectClass = parsedObjectType[ 0 ];
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
index 767e9404e2215221742a2d3ba8c9d33902e5b3bb..846a0c8cc7590f9d332073faf3c0011a5fdb2840 100644
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
+++ b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
@@ -77,7 +77,7 @@ template< typename Real,
           typename Index>
 bool tnlSpmvBenchmarkAdaptiveRgCSR< Real, Device, Index > :: setup( const CSR< Real, Devices::Host, Index >& matrix )
 {
-   //Assert( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
+   //TNL_ASSERT( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
    if( Device :: getDevice() == Devices::HostDevice )
    {
       this->matrix. tuneFormat( desiredChunkSize, cudaBlockSize );
diff --git a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h
index e8b04c87073bdc35c2cc45ff65571be53affbfe4..b0be71e993fde254731aef550d5916401eec073c 100644
--- a/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h
+++ b/src/TNL/legacy/benchmarks/tnlSpmvBenchmarkRgCSRMatrix.h
@@ -69,7 +69,7 @@ template< typename Real,
           typename Index>
 bool tnlSpmvBenchmarkRgCSR< Real, Device, Index > :: setup( const CSR< Real, Devices::Host, Index >& csrMatrix )
 {
-   Assert( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
+   TNL_ASSERT( this->groupSize > 0, std::cerr << "groupSize = " << this->groupSize );
    if( Device :: getDevice() == Devices::HostDevice )
    {
       this->matrix. tuneFormat( groupSize,
diff --git a/src/TNL/legacy/tnl-benchmarks.h b/src/TNL/legacy/tnl-benchmarks.h
index 5e97a98bd32ac6f11cb3d0b7ed757e0d2520570a..df9af1cf43231d36e5460cd410b3c7756529ada9 100644
--- a/src/TNL/legacy/tnl-benchmarks.h
+++ b/src/TNL/legacy/tnl-benchmarks.h
@@ -93,7 +93,7 @@ void tnlCPUReductionMin( const Vector< T >& host_vector,
 {
    const T* data = host_vector. Data();
    const int size = host_vector. GetSize();
-   //Assert( data );
+   //TNL_ASSERT( data );
    min = data[ 0 ];
    for( int i = 1; i < size; i ++ )
       min = :: min( min,  data[ i ] );
@@ -105,7 +105,7 @@ void tnlCPUReductionMax( const Vector< T >& host_vector,
 {
    const T* data = host_vector. Data();
    const int size = host_vector. GetSize();
-   //Assert( data );
+   //TNL_ASSERT( data );
    max = data[ 0 ];
    for( int i = 1; i < size; i ++ )
       max = :: max( max,  data[ i ] );
diff --git a/src/Tools/python-path-test.py b/src/Tools/python-path-test.py
deleted file mode 100644
index 27ac980f994367b58d182385da864695a0b3aa18..0000000000000000000000000000000000000000
--- a/src/Tools/python-path-test.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import TNL
-
-print( "OK" )
diff --git a/src/Tools/src/CMakeLists.txt b/src/Tools/src/CMakeLists.txt
deleted file mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/src/Tools/tnl-dicom-reader.cpp b/src/Tools/tnl-dicom-reader.cpp
index f7e3ba24322a139d6556eafb7fbc498e354e3f2c..5b38b123092ccadf5c1d0fab1df842e7d258640a 100644
--- a/src/Tools/tnl-dicom-reader.cpp
+++ b/src/Tools/tnl-dicom-reader.cpp
@@ -38,7 +38,7 @@ bool processDicomFiles( const Config::ParameterContainer& parameters )
 
 bool processDicomSeries( const Config::ParameterContainer& parameters )
 {
-   const List< String >& dicomSeriesNames = parameters.getParameter< List< String > >( "dicom-series" );
+   const Containers::List< String >& dicomSeriesNames = parameters.getParameter< Containers::List< String > >( "dicom-series" );
    String meshFile = parameters.getParameter< String >( "mesh-file" );
    bool verbose = parameters.getParameter< bool >( "verbose" );
 
diff --git a/src/Tools/tnl-diff.cpp b/src/Tools/tnl-diff.cpp
index 3e1155ce722d0c40142e99889addbc26365b6c56..76c2e792bc5e7b2a8f09304ac038334506c7e586 100644
--- a/src/Tools/tnl-diff.cpp
+++ b/src/Tools/tnl-diff.cpp
@@ -15,7 +15,7 @@
 void setupConfig( Config::ConfigDescription& config )
 {
    config.addEntry< String >( "mesh", "Input mesh file.", "mesh.tnl" );
-   config.addRequiredEntry< List< String > >( "input-files", "The first set of the input files." );
+   config.addRequiredEntry< Containers::List< String > >( "input-files", "The first set of the input files." );
    config.addEntry< String >( "output-file", "File for the output data.", "tnl-diff.log" );
    config.addEntry< String >( "mode", "Mode 'couples' compares two subsequent files. Mode 'sequence' compares the input files against the first one. 'halves' compares the files from the and the second half of the intput files.", "couples" );
       config.addEntryEnum< String >( "couples" );
@@ -56,7 +56,7 @@ int main( int argc, char* argv[] )
       return EXIT_FAILURE;
    }
    std::cout << meshType << " detected in " << meshFile << " file." << std::endl;
-   List< String > parsedMeshType;
+   Containers::List< String > parsedMeshType;
    if( ! parseObjectType( meshType, parsedMeshType ) )
    {
       std::cerr << "Unable to parse the mesh type " << meshType << "." << std::endl;
diff --git a/src/Tools/tnl-diff.h b/src/Tools/tnl-diff.h
index 0d6e1a0f7b2012093e90bc03244c88b1c3770468..73036199d0dcb70c3d32ad1fcfae7d1f4e68318e 100644
--- a/src/Tools/tnl-diff.h
+++ b/src/Tools/tnl-diff.h
@@ -24,7 +24,7 @@ template< typename MeshPointer, typename Element, typename Real, typename Index
 bool computeDifferenceOfMeshFunctions( const MeshPointer& meshPointer, const Config::ParameterContainer& parameters )
 {
    bool verbose = parameters. getParameter< bool >( "verbose" );
-   List< String > inputFiles = parameters. getParameter< List< String > >( "input-files" );
+   Containers::List< String > inputFiles = parameters. getParameter< Containers::List< String > >( "input-files" );
    String mode = parameters. getParameter< String >( "mode" );
    String outputFileName = parameters. getParameter< String >( "output-file" );
    double snapshotPeriod = parameters. getParameter< double >( "snapshot-period" );
@@ -161,7 +161,7 @@ template< typename MeshPointer, typename Element, typename Real, typename Index
 bool computeDifferenceOfVectors( const MeshPointer& meshPointer, const Config::ParameterContainer& parameters )
 {
    bool verbose = parameters. getParameter< bool >( "verbose" );
-   List< String > inputFiles = parameters. getParameter< List< String > >( "input-files" );
+   Containers::List< String > inputFiles = parameters. getParameter< Containers::List< String > >( "input-files" );
    String mode = parameters. getParameter< String >( "mode" );
    String outputFileName = parameters. getParameter< String >( "output-file" );
    double snapshotPeriod = parameters. getParameter< double >( "snapshot-period" );
@@ -307,7 +307,7 @@ bool computeDifference( const MeshPointer& meshPointer, const String& objectType
 template< typename MeshPointer, typename Element, typename Real >
 bool setIndexType( const MeshPointer& meshPointer,
                    const String& inputFileName,
-                   const List< String >& parsedObjectType,
+                   const Containers::List< String >& parsedObjectType,
                    const Config::ParameterContainer& parameters )
 {
    String indexType;
@@ -335,8 +335,8 @@ bool setIndexType( const MeshPointer& meshPointer,
 template< typename MeshPointer >
 bool setTupleType( const MeshPointer& meshPointer,
                    const String& inputFileName,
-                   const List< String >& parsedObjectType,
-                   const List< String >& parsedElementType,
+                   const Containers::List< String >& parsedObjectType,
+                   const Containers::List< String >& parsedElementType,
                    const Config::ParameterContainer& parameters )
 {
    int dimensions = atoi( parsedElementType[ 1 ].getString() );
@@ -386,7 +386,7 @@ bool setTupleType( const MeshPointer& meshPointer,
 template< typename MeshPointer >
 bool setElementType( const MeshPointer& meshPointer,
                      const String& inputFileName,
-                     const List< String >& parsedObjectType,
+                     const Containers::List< String >& parsedObjectType,
                      const Config::ParameterContainer& parameters )
 {
    String elementType;
@@ -410,7 +410,7 @@ bool setElementType( const MeshPointer& meshPointer,
       return setIndexType< MeshPointer, double, double >( meshPointer, inputFileName, parsedObjectType, parameters );
    if( elementType == "long double" )
       return setIndexType< MeshPointer, long double, long double >( meshPointer, inputFileName, parsedObjectType, parameters );
-   List< String > parsedElementType;
+   Containers::List< String > parsedElementType;
    if( ! parseObjectType( elementType, parsedElementType ) )
    {
       std::cerr << "Unable to parse object type " << elementType << "." << std::endl;
@@ -427,7 +427,7 @@ template< typename Mesh >
 bool processFiles( const Config::ParameterContainer& parameters )
 {
    int verbose = parameters. getParameter< int >( "verbose");
-   List< String > inputFiles = parameters. getParameter< List< String > >( "input-files" );
+   Containers::List< String > inputFiles = parameters. getParameter< Containers::List< String > >( "input-files" );
    String& inputFile = inputFiles[ 0 ];
 
    /****
@@ -454,7 +454,7 @@ bool processFiles( const Config::ParameterContainer& parameters )
    if( verbose )
      std::cout << objectType << " detected ... ";
 
-   List< String > parsedObjectType;
+   Containers::List< String > parsedObjectType;
    if( ! parseObjectType( objectType, parsedObjectType ) )
    {
       std::cerr << "Unable to parse object type " << objectType << "." << std::endl;
diff --git a/src/Tools/tnl-image-converter.cpp b/src/Tools/tnl-image-converter.cpp
index 3d8f59bc9b6365a375662a739006b1d1ce42570e..0cece9cf7836e410ff55e089568d6f0873f05692 100644
--- a/src/Tools/tnl-image-converter.cpp
+++ b/src/Tools/tnl-image-converter.cpp
@@ -37,7 +37,7 @@ void configSetup( Config::ConfigDescription& config )
 
 bool processImages( const Config::ParameterContainer& parameters )
 {
-    const List< String >& inputImages = parameters.getParameter< List< String > >( "input-images" );
+    const Containers::List< String >& inputImages = parameters.getParameter< Containers::List< String > >( "input-images" );
     String meshFile = parameters.getParameter< String >( "mesh-file" );
     bool verbose = parameters.getParameter< bool >( "verbose" );
  
@@ -133,7 +133,7 @@ bool processImages( const Config::ParameterContainer& parameters )
 
 bool processTNLFiles( const Config::ParameterContainer& parameters )
 {
-   const List< String >& inputFiles = parameters.getParameter< List< String > >( "input-files" );
+   const Containers::List< String >& inputFiles = parameters.getParameter< Containers::List< String > >( "input-files" );
    const String& imageFormat = parameters.getParameter< String >( "image-format" );
    String meshFile = parameters.getParameter< String >( "mesh-file" );
    bool verbose = parameters.getParameter< bool >( "verbose" );
diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp
index e61afd2f627cbfc7f65f59e0f48d304c3167db72..90870bad68daa3761c20c44b2d04e24ad22c5677 100644
--- a/src/Tools/tnl-init.cpp
+++ b/src/Tools/tnl-init.cpp
@@ -61,7 +61,7 @@ int main( int argc, char* argv[] )
       return EXIT_FAILURE;
    }
    std::cout << meshType << " detected in " << meshFile << " file." << std::endl;
-   List< String > parsedMeshType;
+   Containers::List< String > parsedMeshType;
    if( ! parseObjectType( meshType, parsedMeshType ) )
    {
       std::cerr << "Unable to parse the mesh type " << meshType << "." << std::endl;
diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h
index ffe12126468cfca85bb05e4e0e4ca2585c69498c..98f0e461751a955f73a92bee419f29917bc7b3a5 100644
--- a/src/Tools/tnl-init.h
+++ b/src/Tools/tnl-init.h
@@ -201,7 +201,7 @@ bool resolveRealType( const Config::ParameterContainer& parameters )
 
 
 template< int Dimensions, typename RealType, typename IndexType >
-bool resolveMesh( const List< String >& parsedMeshType,
+bool resolveMesh( const Containers::List< String >& parsedMeshType,
                   const Config::ParameterContainer& parameters )
 {
   std::cout << "+ -> Setting mesh type to " << parsedMeshType[ 0 ] << " ... " << std::endl;
@@ -216,7 +216,7 @@ bool resolveMesh( const List< String >& parsedMeshType,
 }
 
 template< int Dimensions, typename RealType >
-bool resolveIndexType( const List< String >& parsedMeshType,
+bool resolveIndexType( const Containers::List< String >& parsedMeshType,
                        const Config::ParameterContainer& parameters )
 {
   std::cout << "+ -> Setting index type to " << parsedMeshType[ 4 ] << " ... " << std::endl;
@@ -230,7 +230,7 @@ bool resolveIndexType( const List< String >& parsedMeshType,
 }
 
 template< int Dimensions >
-bool resolveRealType( const List< String >& parsedMeshType,
+bool resolveRealType( const Containers::List< String >& parsedMeshType,
                       const Config::ParameterContainer& parameters )
 {
   std::cout << "+ -> Setting real type to " << parsedMeshType[ 2 ] << " ... " << std::endl;
@@ -246,7 +246,7 @@ bool resolveRealType( const List< String >& parsedMeshType,
    return false;
 }
 
-bool resolveMeshType( const List< String >& parsedMeshType,
+bool resolveMeshType( const Containers::List< String >& parsedMeshType,
                       const Config::ParameterContainer& parameters )
 {
   std::cout << "+ -> Setting dimensions to " << parsedMeshType[ 1 ] << " ... " << std::endl;
diff --git a/src/Tools/tnl-quickstart/problem.h.in b/src/Tools/tnl-quickstart/problem.h.in
index def92930e372d2a7ad22349f85d9e88f2e47584b..32fcf2f07a2e020b3b25a3761561e4849f9bca50 100644
--- a/src/Tools/tnl-quickstart/problem.h.in
+++ b/src/Tools/tnl-quickstart/problem.h.in
@@ -29,6 +29,7 @@ class {problemBaseName}Problem:
       using typename BaseType::DofVectorType;
       using typename BaseType::DofVectorPointer;
       using typename BaseType::MeshDependentDataType;
+      using typename BaseType::MeshDependentDataPointer;
 
 
       static TNL::String getTypeStatic();
@@ -46,7 +47,7 @@ class {problemBaseName}Problem:
       bool setInitialCondition( const TNL::Config::ParameterContainer& parameters,
                                 const MeshPointer& mesh,
                                 DofVectorPointer& dofs,
-                                MeshDependentDataType& meshDependentData );
+                                MeshDependentDataPointer& meshDependentData );
 
       template< typename MatrixPointer >
       bool setupLinearSystem( const MeshPointer& mesh,
@@ -56,19 +57,19 @@ class {problemBaseName}Problem:
                          const IndexType& step,
                          const MeshPointer& mesh,
                          DofVectorPointer& dofs,
-                         MeshDependentDataType& meshDependentData );
+                         MeshDependentDataPointer& meshDependentData );
 
       IndexType getDofs( const MeshPointer& mesh ) const;
 
       void bindDofs( const MeshPointer& mesh,
                      DofVectorPointer& dofs );
 
-      void getExplicitRHS( const RealType& time,
-                           const RealType& tau,
-                           const MeshPointer& mesh,
-                           DofVectorPointer& _u,
-                           DofVectorPointer& _fu,
-                           MeshDependentDataType& meshDependentData );
+      void getExplicitUpdate( const RealType& time,
+                              const RealType& tau,
+                              const MeshPointer& mesh,
+                              DofVectorPointer& _u,
+                              DofVectorPointer& _fu,
+                              MeshDependentDataPointer& meshDependentData );
 
       template< typename MatrixPointer >
       void assemblyLinearSystem( const RealType& time,
@@ -77,7 +78,7 @@ class {problemBaseName}Problem:
                                  DofVectorPointer& dofs,
                                  MatrixPointer& matrixPointer,
                                  DofVectorPointer& rightHandSide,
-                                 MeshDependentDataType& meshDependentData );
+                                 MeshDependentDataPointer& meshDependentData );
 
    protected:
     
diff --git a/src/Tools/tnl-quickstart/problem_impl.h.in b/src/Tools/tnl-quickstart/problem_impl.h.in
index 0a495676127f6e5a5f85402ebc7c994d19c79c36..18ba3a287ab6d6df79a3d3d594aa8a12147f8c0a 100644
--- a/src/Tools/tnl-quickstart/problem_impl.h.in
+++ b/src/Tools/tnl-quickstart/problem_impl.h.in
@@ -94,7 +94,7 @@ bool
 setInitialCondition( const TNL::Config::ParameterContainer& parameters,    
                      const MeshPointer& mesh,
                      DofVectorPointer& dofs,
-                     MeshDependentDataType& meshDependentData )
+                     MeshDependentDataPointer& meshDependentData )
 {{
    const TNL::String& initialConditionFile = parameters.getParameter< TNL::String >( "initial-condition" );
    TNL::Functions::MeshFunction< Mesh > u( mesh, dofs );
@@ -142,7 +142,7 @@ makeSnapshot( const RealType& time,
               const IndexType& step,
               const MeshPointer& mesh,
               DofVectorPointer& dofs,
-              MeshDependentDataType& meshDependentData )
+              MeshDependentDataPointer& meshDependentData )
 {{
    std::cout << std::endl << "Writing output at time " << time << " step " << step << "." << std::endl;
    this->bindDofs( mesh, dofs );
@@ -161,15 +161,15 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 {problemBaseName}Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::                    
-getExplicitRHS( const RealType& time,
-                const RealType& tau,
-                const MeshPointer& mesh,
-                DofVectorPointer& _u,
-                DofVectorPointer& _fu,
-                MeshDependentDataType& meshDependentData )
+getExplicitUpdate( const RealType& time,
+                   const RealType& tau,
+                   const MeshPointer& mesh,
+                   DofVectorPointer& _u,
+                   DofVectorPointer& _fu,
+                   MeshDependentDataPointer& meshDependentData )
 {{
    /****
-    * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you
+    * If you use an explicit solver like EulerSolver or MersonSolver, you
     * need to implement this method. Compute the right-hand side of
     *
     *   d/dt u(x) = fu( x, u )
@@ -208,7 +208,7 @@ assemblyLinearSystem( const RealType& time,
                       DofVectorPointer& _u,
                       MatrixPointer& matrixPointer,
                       DofVectorPointer& b,
-                      MeshDependentDataType& meshDependentData )
+                      MeshDependentDataPointer& meshDependentData )
 {{
    TNL::Solvers::PDE::LinearSystemAssembler< Mesh,
                              MeshFunctionType,
diff --git a/src/Tools/tnl-view.cpp b/src/Tools/tnl-view.cpp
index f7d974821f6d702b6a23afa6bfe33e82d8d1703a..3f4e19304eda56866e0350e888c728aa8457b87f 100644
--- a/src/Tools/tnl-view.cpp
+++ b/src/Tools/tnl-view.cpp
@@ -71,7 +71,7 @@ int main( int argc, char* argv[] )
       return EXIT_FAILURE;
    }
    std::cout << meshType << " detected in " << meshFile << " file." << std::endl;
-   List< String > parsedMeshType;
+   Containers::List< String > parsedMeshType;
    if( ! parseObjectType( meshType, parsedMeshType ) )
    {
       std::cerr << "Unable to parse the mesh type " << meshType << "." << std::endl;
diff --git a/src/Tools/tnl-view.h b/src/Tools/tnl-view.h
index c396954ae697ed28d15a8849309e16f7bb48c2c6..60c366c4aa581f4cb107d12b1357aef61e966440 100644
--- a/src/Tools/tnl-view.h
+++ b/src/Tools/tnl-view.h
@@ -20,7 +20,6 @@
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Functions/MeshFunction.h>
 
-using namespace std;
 using namespace TNL;
 
 bool getOutputFileName( const String& inputFileName,
@@ -50,6 +49,7 @@ bool writeMeshFunction( const typename MeshFunction::MeshPointer& meshPointer,
                         const Config::ParameterContainer& parameters  )
 {
    MeshFunction function( meshPointer );
+   std::cout << "Mesh function: " << function.getType() << std::endl;
    if( ! function.load( inputFileName ) )
    {
       std::cerr << "Unable to load mesh function from a file " << inputFileName << "." << std::endl;
@@ -83,7 +83,7 @@ template< typename MeshPointer,
           int EntityDimensions >
 bool setMeshEntityType( const MeshPointer& meshPointer,
                         const String& inputFileName,
-                        const List< String >& parsedObjectType,
+                        const Containers::List< String >& parsedObjectType,
                         const Config::ParameterContainer& parameters )
 {
    if( parsedObjectType[ 3 ] == "float" )
@@ -100,7 +100,7 @@ template< typename MeshReal,
           typename MeshIndex >
 bool setMeshEntityDimensions( const SharedPointer< Meshes::Grid< 1, MeshReal, Devices::Host, MeshIndex > >& meshPointer,
                               const String& inputFileName,
-                              const List< String >& parsedObjectType,
+                              const Containers::List< String >& parsedObjectType,
                               const Config::ParameterContainer& parameters )
 {
    typedef Meshes::Grid< 1, MeshReal, Devices::Host, MeshIndex > Mesh;
@@ -124,7 +124,7 @@ template< typename MeshReal,
           typename MeshIndex >
 bool setMeshEntityDimensions( const SharedPointer< Meshes::Grid< 2, MeshReal, Devices::Host, MeshIndex > >& meshPointer,
                               const String& inputFileName,
-                              const List< String >& parsedObjectType,
+                              const Containers::List< String >& parsedObjectType,
                               const Config::ParameterContainer& parameters )
 {
    typedef Meshes::Grid< 2, MeshReal, Devices::Host, MeshIndex > Mesh;
@@ -151,7 +151,7 @@ template< typename MeshReal,
           typename MeshIndex >
 bool setMeshEntityDimensions( const SharedPointer< Meshes::Grid< 3, MeshReal, Devices::Host, MeshIndex > >& meshPointer,
                               const String& inputFileName,
-                              const List< String >& parsedObjectType,
+                              const Containers::List< String >& parsedObjectType,
                               const Config::ParameterContainer& parameters )
 {
    typedef Meshes::Grid< 3, MeshReal, Devices::Host, MeshIndex > Mesh;
@@ -180,7 +180,7 @@ bool setMeshEntityDimensions( const SharedPointer< Meshes::Grid< 3, MeshReal, De
 template< typename MeshPointer >
 bool setMeshFunction( const MeshPointer& meshPointer,
                       const String& inputFileName,
-                      const List< String >& parsedObjectType,
+                      const Containers::List< String >& parsedObjectType,
                       const Config::ParameterContainer& parameters )
 {
    if( parsedObjectType[ 1 ] != meshPointer->getSerializationType() )
@@ -195,7 +195,7 @@ bool setMeshFunction( const MeshPointer& meshPointer,
 template< typename MeshPointer, typename Element, typename Real, typename Index, int Dimensions >
 bool convertObject( const MeshPointer& meshPointer,
                     const String& inputFileName,
-                    const List< String >& parsedObjectType,
+                    const Containers::List< String >& parsedObjectType,
                     const Config::ParameterContainer& parameters )
 {
    int verbose = parameters. getParameter< int >( "verbose");
@@ -213,10 +213,15 @@ bool convertObject( const MeshPointer& meshPointer,
        parsedObjectType[ 0 ] == "tnlSharedVector" ||   // TODO: remove deprecated type names
        parsedObjectType[ 0 ] == "tnlVector" )          //
    {
-      Containers::Vector< Element, Devices::Host, Index > vector;
-      if( ! vector. load( inputFileName ) )
+      using MeshType = typename MeshPointer::ObjectType;
+      // FIXME: why is MeshType::IndexType not the same as Index?
+//      Containers::Vector< Element, Devices::Host, Index > vector;
+      Containers::Vector< Element, Devices::Host, typename MeshType::IndexType > vector;
+      if( ! vector.load( inputFileName ) )
          return false;
-      if( ! meshPointer->write( vector, outputFileName, outputFormat ) )
+      Functions::MeshFunction< MeshType, MeshType::meshDimensions, Element > mf;
+      mf.bind( meshPointer, vector );
+      if( ! mf.write( outputFileName, outputFormat ) )
          return false;
    }
 
@@ -243,7 +248,7 @@ bool convertObject( const MeshPointer& meshPointer,
 template< typename MeshPointer, typename Element, typename Real, typename Index >
 bool setDimensions( const MeshPointer& meshPointer,
                     const String& inputFileName,
-                    const List< String >& parsedObjectType,
+                    const Containers::List< String >& parsedObjectType,
                     const Config::ParameterContainer& parameters )
 {
    int dimensions( 0 );
@@ -271,7 +276,7 @@ bool setDimensions( const MeshPointer& meshPointer,
 template< typename MeshPointer, typename Element, typename Real >
 bool setIndexType( const MeshPointer& meshPointer,
                    const String& inputFileName,
-                   const List< String >& parsedObjectType,
+                   const Containers::List< String >& parsedObjectType,
                    const Config::ParameterContainer& parameters )
 {
    String indexType;
@@ -288,15 +293,15 @@ bool setIndexType( const MeshPointer& meshPointer,
       return setDimensions< MeshPointer, Element, Real, int >( meshPointer, inputFileName, parsedObjectType, parameters );
    if( indexType == "long-int" )
       return setDimensions< MeshPointer, Element, Real, long int >( meshPointer, inputFileName, parsedObjectType, parameters );
-   cerr << "Unknown index type " << indexType << "." << endl;
+   std::cerr << "Unknown index type " << indexType << "." << std::endl;
    return false;
 }
 
 template< typename MeshPointer >
 bool setTupleType( const MeshPointer& meshPointer,
                    const String& inputFileName,
-                   const List< String >& parsedObjectType,
-                   const List< String >& parsedElementType,
+                   const Containers::List< String >& parsedObjectType,
+                   const Containers::List< String >& parsedElementType,
                    const Config::ParameterContainer& parameters )
 {
    int dimensions = atoi( parsedElementType[ 1 ].getString() );
@@ -346,7 +351,7 @@ bool setTupleType( const MeshPointer& meshPointer,
 template< typename MeshPointer >
 bool setElementType( const MeshPointer& meshPointer,
                      const String& inputFileName,
-                     const List< String >& parsedObjectType,
+                     const Containers::List< String >& parsedObjectType,
                      const Config::ParameterContainer& parameters )
 {
    String elementType;
@@ -368,7 +373,7 @@ bool setElementType( const MeshPointer& meshPointer,
       return setIndexType< MeshPointer, double, double >( meshPointer, inputFileName, parsedObjectType, parameters );
    if( elementType == "long double" )
       return setIndexType< MeshPointer, long double, long double >( meshPointer, inputFileName, parsedObjectType, parameters );
-   List< String > parsedElementType;
+   Containers::List< String > parsedElementType;
    if( ! parseObjectType( elementType, parsedElementType ) )
    {
       std::cerr << "Unable to parse object type " << elementType << "." << std::endl;
@@ -400,7 +405,7 @@ bool processFiles( const Config::ParameterContainer& parameters )
    meshPointer->writeMesh( "mesh.asy", "asymptote" );
 
    bool checkOutputFile = parameters. getParameter< bool >( "check-output-file" );
-   List< String > inputFiles = parameters. getParameter< List< String > >( "input-files" );
+   Containers::List< String > inputFiles = parameters. getParameter< Containers::List< String > >( "input-files" );
    bool error( false );
 //#ifdef HAVE_OPENMP
 //#pragma omp parallel for
@@ -434,7 +439,7 @@ bool processFiles( const Config::ParameterContainer& parameters )
          if( verbose )
            std::cout << objectType << " detected ... ";
 
-         List< String > parsedObjectType;
+         Containers::List< String > parsedObjectType;
          if( ! parseObjectType( objectType, parsedObjectType ) )
          {
             std::cerr << "Unable to parse object type " << objectType << "." << std::endl;
diff --git a/src/Tools/tnlcurve2gnuplot.cpp b/src/Tools/tnlcurve2gnuplot.cpp
index baea729a13a36d63fc0d7bd86ac9287e91c35af4..16f7f372f37ea7881506aac6d7894036676a0256 100644
--- a/src/Tools/tnlcurve2gnuplot.cpp
+++ b/src/Tools/tnlcurve2gnuplot.cpp
@@ -36,9 +36,9 @@ int main( int argc, char* argv[] )
       return 1;
    }
 
-   List< String > input_files = parameters. getParameter< List< String > >( "input-files" );
-   List< String > output_files;
-   if( ! parameters. getParameter< List< String > >( "output-files", output_files ) )
+   Containers::List< String > input_files = parameters. getParameter< Containers::List< String > >( "input-files" );
+   Containers::List< String > output_files;
+   if( ! parameters. getParameter< Containers::List< String > >( "output-files", output_files ) )
       std::cout << "No output files were given." << std::endl;
    int output_step( 1 );
    parameters. getParameter< int >( "output-step", output_step );
diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h
index 796f5b6e69f651c4755c55e1b0696550b14ac481..d11489d4d561abaf77acadcbe31870cb5ce63a6f 100644
--- a/src/UnitTests/Containers/ArrayOperationsTest.h
+++ b/src/UnitTests/Containers/ArrayOperationsTest.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Containers/ArrayOperations.h>
+#include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Devices/Cuda.h>
 
 #ifdef HAVE_GTEST 
@@ -19,6 +19,7 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Containers::Algorithms;
 
 int getTestSize()
 {
diff --git a/src/UnitTests/Containers/VectorOperationsTest.h b/src/UnitTests/Containers/VectorOperationsTest.h
index 6f7d7d46ce3d89b77309c34343af12e60af5b90f..08d18b1704ca0c23ee32928551b55a5feb96c772 100644
--- a/src/UnitTests/Containers/VectorOperationsTest.h
+++ b/src/UnitTests/Containers/VectorOperationsTest.h
@@ -19,6 +19,7 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Containers::Algorithms;
 
 #ifdef HAVE_GTEST
 
diff --git a/src/UnitTests/ListTest.cpp b/src/UnitTests/ListTest.cpp
index d14883f82f0f187000fe9ad17e9a9a01f361c761..6774d29f578e911d42e919d2dd8ff0b9f81a1610 100644
--- a/src/UnitTests/ListTest.cpp
+++ b/src/UnitTests/ListTest.cpp
@@ -12,7 +12,7 @@
 #include "gtest/gtest.h"
 #endif
 
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 
 using namespace TNL;
 
diff --git a/src/UnitTests/ObjectTest.cpp b/src/UnitTests/ObjectTest.cpp
index 557d1239ba2aceca88c65f3f4f253641ebd4a5d8..3c495846d3de7b62e8c0efb5050970b2fc275fdc 100644
--- a/src/UnitTests/ObjectTest.cpp
+++ b/src/UnitTests/ObjectTest.cpp
@@ -11,6 +11,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Object.h>
 #include <TNL/File.h>
+#include <TNL/Containers/Array.h>
 
 #ifdef HAVE_GTEST 
 #include "gtest/gtest.h"
@@ -29,6 +30,77 @@ TEST( ObjectTest, SaveAndLoadTest )
    file.open( "test-file.tnl", tnlReadMode );
    ASSERT_TRUE( testObject.load( file ) );
 }
+
+TEST( ObjectTest, parseObjectTypeTest )
+{
+   Containers::List< String > parsed;
+   Containers::List< String > expected;
+
+   // plain type
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "int", parsed ) );
+   expected.Append( "int" );
+   EXPECT_EQ( parsed, expected );
+
+   // type with space
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "short int", parsed ) );
+   expected.Append( "short int" );
+   EXPECT_EQ( parsed, expected );
+
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "unsigned short int", parsed ) );
+   expected.Append( "unsigned short int" );
+   EXPECT_EQ( parsed, expected );
+
+   // composed type
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "Containers::Vector< double, Devices::Host, int >", parsed ) );
+   expected.Append( "Containers::Vector" );
+   expected.Append( "double" );
+   expected.Append( "Devices::Host" );
+   expected.Append( "int" );
+   EXPECT_EQ( parsed, expected );
+
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "Containers::Vector< Containers::List< String >, Devices::Host, int >", parsed ) );
+   expected.Append( "Containers::Vector" );
+   expected.Append( "Containers::List< String >" );
+   expected.Append( "Devices::Host" );
+   expected.Append( "int" );
+   EXPECT_EQ( parsed, expected );
+
+   // spaces in the template parameter
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "A< short int >", parsed ) );
+   expected.Append( "A" );
+   expected.Append( "short int" );
+   EXPECT_EQ( parsed, expected );
+
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "A< B< short int >, C >", parsed ) );
+   expected.Append( "A" );
+   expected.Append( "B< short int >" );
+   expected.Append( "C" );
+   EXPECT_EQ( parsed, expected );
+
+   // spaces at different places in the template parameter
+   parsed.reset();
+   expected.reset();
+   ASSERT_TRUE( parseObjectType( "A< b , c <E>  ,d>", parsed ) );
+   expected.Append( "A" );
+   expected.Append( "b" );
+   expected.Append( "c <E>" );
+   expected.Append( "d" );
+   EXPECT_EQ( parsed, expected );
+}
 #endif
 
 
diff --git a/src/UnitTests/StringTest.cpp b/src/UnitTests/StringTest.cpp
index 9b76b0f6d450f2c1f71882a844443d15f270faf4..1063d139dc9781af1fafc2f8ecf34e2a6c8401fe 100644
--- a/src/UnitTests/StringTest.cpp
+++ b/src/UnitTests/StringTest.cpp
@@ -106,6 +106,17 @@ TEST( StringTest, AdditionAssignmentOperator )
    ASSERT_EQ( strcmp( string2. getString(), "stringstring2" ), 0 );
 }
 
+TEST( StringTest, strip )
+{
+   EXPECT_EQ( String( "string" ).strip(), String( "string" ) );
+   EXPECT_EQ( String( "  string" ).strip(), String( "string" ) );
+   EXPECT_EQ( String( "string  " ).strip(), String( "string" ) );
+   EXPECT_EQ( String( "  string  " ).strip(), String( "string" ) );
+   EXPECT_EQ( String( " string1  string2  " ).strip(), String( "string1  string2" ) );
+   EXPECT_EQ( String( "" ).strip(), String( "" ) );
+   EXPECT_EQ( String( "  " ).strip(), String( "" ) );
+}
+
 
 TEST( StringTest, SaveLoad )
 {
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
index 39a5ff2d40750a1b4ba731815e9b634728b61cb9..04f74a2b54a8710b73d67f1a0a9946aa6ad5205e 100755
--- a/tests/benchmarks/CMakeLists.txt
+++ b/tests/benchmarks/CMakeLists.txt
@@ -2,35 +2,30 @@ ADD_SUBDIRECTORY( share )
 ADD_SUBDIRECTORY( heat-equation-benchmark )
 
 IF( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cu )
     if( HAVE_CUBLAS STREQUAL "yes" )
-        CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} )
+        CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas${debugExt} )
     endif()
-    TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
-    
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
+
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
-    
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
+
     CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
 ELSE()
+    ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cpp )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} )
+
     ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
     TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )
 
-    ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cpp )    
+    ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cpp )
     TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
 ENDIF()
 
-if( BUILD_CUDA )                                                              
-   INSTALL( TARGETS
-                tnl-cuda-benchmarks${debugExt}
-            RUNTIME DESTINATION bin )
-endif()
-
-INSTALL( TARGETS 
+INSTALL( TARGETS
+            tnl-benchmark-blas${debugExt}
             tnl-benchmark-spmv${debugExt}
-            tnl-benchmark-linear-solvers${debugExt}                 
+            tnl-benchmark-linear-solvers${debugExt}
          RUNTIME DESTINATION bin )
-
-
-                                            
diff --git a/tests/benchmarks/array-operations.h b/tests/benchmarks/array-operations.h
index 890f9ddc490cec0dee85c9c5b7e607f467b2b8bc..a7c1513e47f122adbc7fd334c2535b4d9cb75777 100644
--- a/tests/benchmarks/array-operations.h
+++ b/tests/benchmarks/array-operations.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          array-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include "benchmarks.h"
@@ -14,7 +26,7 @@ template< typename Real = double,
 bool
 benchmarkArrayOperations( Benchmark & benchmark,
                           const int & loops,
-                          const int & size )
+                          const long & size )
 {
     typedef Containers::Array< Real, Devices::Host, Index > HostArray;
     typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
@@ -25,9 +37,14 @@ benchmarkArrayOperations( Benchmark & benchmark,
     HostArray hostArray, hostArray2;
     CudaArray deviceArray, deviceArray2;
     if( ! hostArray.setSize( size ) ||
-        ! hostArray2.setSize( size ) ||
+        ! hostArray2.setSize( size )
+#ifdef HAVE_CUDA
+        ||
         ! deviceArray.setSize( size ) ||
-        ! deviceArray2.setSize( size ) )
+        ! deviceArray2.setSize( size )
+#endif
+    )
+
     {
         const char* msg = "error: allocation of arrays failed";
         std::cerr << msg << std::endl;
@@ -41,11 +58,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
     // reset functions
     auto reset1 = [&]() {
         hostArray.setValue( 1.0 );
+#ifdef HAVE_CUDA
         deviceArray.setValue( 1.0 );
+#endif
     };
     auto reset2 = [&]() {
         hostArray2.setValue( 1.0 );
+#ifdef HAVE_CUDA
         deviceArray2.setValue( 1.0 );
+#endif
     };
     auto reset12 = [&]() {
         reset1();
@@ -63,9 +84,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
         resultDevice = (int) deviceArray == deviceArray2;
     };
     benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
-    benchmark.time( reset1,
-                    "CPU", compareHost,
-                    "GPU", compareCuda );
+    benchmark.time( reset1, "CPU", compareHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", compareCuda );
+#endif
 
 
     auto copyAssignHostHost = [&]() {
@@ -75,9 +97,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
         deviceArray = deviceArray2;
     };
     benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
-    double basetime = benchmark.time( reset1,
-                    "CPU", copyAssignHostHost,
-                    "GPU", copyAssignCudaCuda );
+    // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
+    // complain when compiling without CUDA
+    const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", copyAssignCudaCuda );
+#endif
 
 
     auto copyAssignHostCuda = [&]() {
@@ -86,10 +111,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
     auto copyAssignCudaHost = [&]() {
         hostArray = deviceArray;
     };
-    benchmark.setOperation( "copy (operator=)", datasetSize, basetime );
+#ifdef HAVE_CUDA
+    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
     benchmark.time( reset1,
                     "CPU->GPU", copyAssignHostCuda,
                     "GPU->CPU", copyAssignCudaHost );
+#endif
 
 
     auto setValueHost = [&]() {
@@ -99,9 +126,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
         deviceArray.setValue( 3.0 );
     };
     benchmark.setOperation( "setValue", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", setValueHost,
-                    "GPU", setValueCuda );
+    benchmark.time( reset1, "CPU", setValueHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", setValueCuda );
+#endif
 
 
     auto setSizeHost = [&]() {
@@ -112,12 +140,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
     };
     auto resetSize1 = [&]() {
         hostArray.reset();
+#ifdef HAVE_CUDA
         deviceArray.reset();
+#endif
     };
     benchmark.setOperation( "allocation (setSize)", datasetSize );
-    benchmark.time( resetSize1,
-                    "CPU", setSizeHost,
-                    "GPU", setSizeCuda );
+    benchmark.time( resetSize1, "CPU", setSizeHost );
+#ifdef HAVE_CUDA
+    benchmark.time( resetSize1, "GPU", setSizeCuda );
+#endif
 
 
     auto resetSizeHost = [&]() {
@@ -128,12 +159,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
     };
     auto setSize1 = [&]() {
         hostArray.setSize( size );
+#ifdef HAVE_CUDA
         deviceArray.setSize( size );
+#endif
     };
     benchmark.setOperation( "deallocation (reset)", datasetSize );
-    benchmark.time( setSize1,
-                    "CPU", resetSizeHost,
-                    "GPU", resetSizeCuda );
+    benchmark.time( setSize1, "CPU", resetSizeHost );
+#ifdef HAVE_CUDA
+    benchmark.time( setSize1, "GPU", resetSizeCuda );
+#endif
 
     return true;
 }
diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h
index 20f3d042f8c676bb4b45ecad131bdcad023c162e..ce5e631a6899170cfeba58911be71e5cc17eb7e6 100644
--- a/tests/benchmarks/benchmarks.h
+++ b/tests/benchmarks/benchmarks.h
@@ -1,3 +1,15 @@
+/***************************************************************************
+                          benchmarks.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include <iostream>
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
index 52b26fb860c6f4f7defce8921095d937c4d3b127..25d6eb81ba931c38b222a81ff120b78d3ff07b6d 100644
--- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
+++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h
@@ -70,7 +70,7 @@ class HeatEquationBenchmarkProblem:
       void bindDofs( const MeshPointer& meshPointer,
                      DofVectorPointer& dofsPointer );
 
-      void getExplicitRHS( const RealType& time,
+      void getExplicitUpdate( const RealType& time,
                            const RealType& tau,
                            const MeshPointer& meshPointer,
                            DofVectorPointer& _uPointer,
diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
index e9e3b5af463addbc9b1411ce779f8616a64c0905..0d457445bd8e172a80852d25e74bf185aa8b087e 100644
--- a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
+++ b/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h
@@ -354,15 +354,15 @@ heatEquationTemplatedCompact( const GridType* grid,
    {
       GridEntity entity( *grid, coordinates, entityOrientation, entityBasis );
       
-      //entity.refresh();
-      /*if( ! entity.isBoundaryEntity() )
+      entity.refresh();
+      if( ! entity.isBoundaryEntity() )
       {
          fu( entity ) = 
             ( *differentialOperator )( u, entity, time );
 
          typedef Functions::FunctionAdapter< GridType, RightHandSide > FunctionAdapter;
          fu( entity ) +=  FunctionAdapter::getValue( *rightHandSide, entity, time );
-      }*/
+      }
    }
 }
 #endif
@@ -375,7 +375,7 @@ template< typename Mesh,
           typename DifferentialOperator >
 void
 HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
-getExplicitRHS( const RealType& time,
+getExplicitUpdate( const RealType& time,
                 const RealType& tau,
                 const MeshPointer& mesh,
                 DofVectorPointer& uDofs,
diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
index 8de3a739c094b1935ec88d1ff14d18307da77218..3097d652f0a9928d791854c1eacb2728790dba6e 100644
--- a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
+++ b/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h
@@ -52,12 +52,12 @@ __global__ void boundaryConditionsKernel( Real* u,
       aux[ j * gridXSize + gridYSize - 1 ] = 0.0;
       u[ j * gridXSize + gridYSize - 1 ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
    }
-   if( j == 0 && i > 0 && i < gridXSize - 1 )
+   if( j == 0 && i < gridXSize )
    {
       aux[ i ] = 0.0; //u[ j * gridXSize + 1 ];
       u[ i ] = 0.0; //u[ j * gridXSize + 1 ];
    }
-   if( j == gridYSize -1  && i > 0 && i < gridXSize - 1 )
+   if( j == gridYSize -1  && i < gridXSize )
    {
       aux[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
       u[ j * gridXSize + i ] = 0.0; //u[ j * gridXSize + gridXSize - 1 ];      
@@ -80,9 +80,11 @@ __global__ void heatEquationKernel( const Real* u,
        j > 0 && j < gridYSize - 1 )
    {
       const Index c = j * gridXSize + i;
-      aux[ c ] = tau * ( ( u[ c - 1 ] - 2.0 * u[ c ] + u[ c + 1 ] ) * hx_inv +
-                       ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv );
-   }
+      aux[ c ] = ( ( u[ c - 1 ]         - 2.0 * u[ c ] + u[ c + 1 ]         ) * hx_inv +
+                   ( u[ c - gridXSize ] - 2.0 * u[ c ] + u[ c + gridXSize ] ) * hy_inv );
+      //aux[ c ] = ( ( __ldg( &u[ c - 1 ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + 1 ] ) ) * hx_inv +
+      //                   ( __ldg( &u[ c - gridXSize ] ) - 2.0 * __ldg( &u[ c ] ) + __ldg( &u[ c + gridXSize ] ) ) * hy_inv );
+   }  
 }
 
 template< typename RealType >
diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
index 6834ed731da5afeec97f485b4723d176fe741527..a53770568e4a0305e3cb0279c3f307e4a9139e69 100644
--- a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
+++ b/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h
@@ -190,20 +190,27 @@ template< typename Real, typename Index >
 __global__ void updateKernel( Real* u,
                               Real* aux,
                               Real* cudaBlockResidue,
-                              const Index dofs )
+                              const Index dofs,
+                              Real tau )
 {
+   extern __shared__ Real du[];
    const Index blockOffset = blockIdx.x * blockDim.x;
    Index idx = blockOffset + threadIdx.x;
  
    if( idx < dofs )
-      u[ idx ] += aux[ idx ];
+   {
+      u[ idx ] += tau * aux[ idx ];
+      du[ threadIdx.x ] = fabs( aux[ idx ] );
+   }
+   else
+      du[ threadIdx.x ] = 0.0;
  
    __syncthreads();
 
    const Index rest = dofs - blockOffset;
    Index n =  rest < blockDim.x ? rest : blockDim.x;
 
-   computeBlockResidue< Real, Index >( aux,
+   computeBlockResidue< Real, Index >( du,
                                        cudaBlockResidue,
                                        n );
 }
@@ -346,29 +353,31 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
       const Real timeLeft = finalTime - time;
       const Real currentTau = tau < timeLeft ? tau : timeLeft;    
       
-      if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, tau, hx_inv, hy_inv, gridXSize, gridYSize) )
+      if( ! pureCRhsCuda( cudaGridSize, cudaBlockSize, cuda_u, cuda_aux, currentTau, hx_inv, hy_inv, gridXSize, gridYSize) )
          return false;
       computationTimer.stop();
       
-      /*cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
-      writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
-      getchar();*/
-        
+      /*if( iteration % 100 == 0 )
+      {
+         cudaMemcpy( aux, cuda_aux, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
+         writeFunction( "rhs", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
+
+         cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
+         writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
+         getchar();
+      }*/      
+      
       updateTimer.start();
       /****
        * Update
        */
       //cout << "Update ... " << std::endl;
-      updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount );
+      updateKernel<<< cudaUpdateBlocks, cudaUpdateBlockSize, cudaUpdateBlockSize.x * sizeof( Real ) >>>( cuda_u, cuda_aux, cuda_max_du, dofsCount, tau );
       if( cudaGetLastError() != cudaSuccess )
       {
          std::cerr << "Update failed." << std::endl;
          return false;
-      }
-      /*cudaMemcpy( aux, cuda_u, dofsCount * sizeof( Real ),  cudaMemcpyDeviceToHost );
-      writeFunction( "u", aux, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
-      getchar();*/
-      
+      }            
       
       cudaThreadSynchronize();
       cudaMemcpy( max_du, cuda_max_du, cudaUpdateBlocks.x * sizeof( Real ), cudaMemcpyDeviceToHost );
@@ -391,12 +400,18 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters,
          cout << "Iteration: " << iteration << "\t Time:" << time << "    \r" << flush;                 
    }
    timer.stop();
+   if( verbose )
+     cout << endl;
+   
    //cudaMemcpy( u, cuda_u, dofsCount * sizeof( Real ), cudaMemcpyDeviceToHost );
    //writeFunction( "final", u, gridXSize, gridYSize, hx, hy, domainXSize / 2.0, domainYSize / 2.0 );
 
    /****
     * Saving the result
     */
+   if( verbose )
+     std::cout << "Saving result..." << std::endl;
+   
    meshFunction.save( "simple-heat-equation-result.tnl" );
    
    /***
@@ -535,6 +550,9 @@ bool solveHeatEquationHost( const Config::ParameterContainer& parameters,
         std::cout << "Iteration: " << iteration << "\t \t Time:" << time << "    \r" << std::flush;
    }
    timer.stop();
+   if( verbose )
+     cout << endl;
+
    
    /****
     * Saving the result
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h b/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h
index ba6774b30c902b5b3c720742f79a5046b4876f64..c4d33c9b06b9942b35cd04c0c4ff069e1050c44a 100644
--- a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h
+++ b/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h
@@ -189,7 +189,7 @@ class Meshes::Grid< 2, Real, Device, Index > : public tnlObject
 
 #include <fstream>
 #include <iomanip>
-#include <core/tnlAssert.h>
+#include <core/tnlTNL_ASSERT.h>
 #include <mesh/GnuplotWriter.h>
 #include <mesh/grids/GridEntityGetter_impl.h>
 #include <mesh/grids/NeighbourGridEntityGetter2D_impl.h>
@@ -310,8 +310,8 @@ template< typename Real,
           typename Index >
 void Meshes::Grid< 2, Real, Device, Index > :: setDimensions( const Index xSize, const Index ySize )
 {
-   tnlAssert( xSize > 0, cerr << "xSize = " << xSize );
-   tnlAssert( ySize > 0, cerr << "ySize = " << ySize );
+   tnlTNL_ASSERT( xSize > 0, cerr << "xSize = " << xSize );
+   tnlTNL_ASSERT( ySize > 0, cerr << "ySize = " << ySize );
 
    this->dimensions.x() = xSize;
    this->dimensions.y() = ySize;
@@ -470,9 +470,9 @@ const Real&
 Meshes::Grid< 2, Real, Device, Index >::
 getSpaceStepsProducts() const
 {
-   tnlAssert( xPow >= -2 && xPow <= 2, 
+   tnlTNL_ASSERT( xPow >= -2 && xPow <= 2, 
               cerr << " xPow = " << xPow );
-   tnlAssert( yPow >= -2 && yPow <= 2, 
+   tnlTNL_ASSERT( yPow >= -2 && yPow <= 2, 
               cerr << " yPow = " << yPow );
 
    return this->spaceStepsProducts[ yPow + 2 ][ xPow + 2 ];
@@ -1075,8 +1075,8 @@ template< typename Real,
           typename Index >
 void Meshes::Grid< 2, Real, Device, Index > :: setDimensions( const Index xSize, const Index ySize )
 {
-   tnlAssert( xSize > 0, cerr << "xSize = " << xSize );
-   tnlAssert( ySize > 0, cerr << "ySize = " << ySize );
+   tnlTNL_ASSERT( xSize > 0, cerr << "xSize = " << xSize );
+   tnlTNL_ASSERT( ySize > 0, cerr << "ySize = " << ySize );
 
    this->dimensions.x() = xSize;
    this->dimensions.y() = ySize;
@@ -1235,9 +1235,9 @@ const Real&
 Meshes::Grid< 2, Real, Device, Index >::
 getSpaceStepsProducts() const
 {
-   tnlAssert( xPow >= -2 && xPow <= 2, 
+   tnlTNL_ASSERT( xPow >= -2 && xPow <= 2, 
               cerr << " xPow = " << xPow );
-   tnlAssert( yPow >= -2 && yPow <= 2, 
+   tnlTNL_ASSERT( yPow >= -2 && yPow <= 2, 
               cerr << " yPow = " << yPow );
 
    return this->spaceStepsProducts[ yPow + 2 ][ xPow + 2 ];
diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h b/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h
index 29635a441122b9e457f60e8207ba857282d7e56b..946be827f9c760d315866a10059fc3f2afa442f4 100644
--- a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h
+++ b/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h
@@ -17,7 +17,7 @@
 
 #pragma once 
 
-#include <core/tnlAssert.h>
+#include <core/tnlTNL_ASSERT.h>
 
 
 template< typename GridEntity,
@@ -33,14 +33,14 @@ class tnlTestNeighbourGridEntityGetter
       __cuda_callable__
       tnlTestNeighbourGridEntityGetter( const GridEntity& entity )
       {
-         //tnlAssert( false, );
+         //TNL_ASSERT( false, );
       };
       
       __cuda_callable__
       void refresh( const typename GridEntity::GridType& grid,
                     const typename GridEntity::IndexType& entityIndex )
       {
-         //tnlAssert( false, );
+         //TNL_ASSERT( false, );
       };
 
 };
diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h
index a197a15a73a0df9d2096dd6fc866b0a2b51e7036..126a3d6f57d2f321d3338f7dfc44a71341204ab0 100644
--- a/tests/benchmarks/spmv.h
+++ b/tests/benchmarks/spmv.h
@@ -1,8 +1,20 @@
+/***************************************************************************
+                          spmv.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
 #include "benchmarks.h"
 
-#include <TNL/List.h>
+#include <TNL/Containers/List.h>
 #include <TNL/Matrices/CSR.h>
 #include <TNL/Matrices/Ellpack.h>
 #include <TNL/Matrices/SlicedEllpack.h>
@@ -102,18 +114,22 @@ benchmarkSpMV( Benchmark & benchmark,
     CudaVector deviceVector, deviceVector2;
 
     // create benchmark group
-    List< String > parsedType;
+    Containers::List< String > parsedType;
     parseObjectType( HostMatrix::getType(), parsedType );
     benchmark.createHorizontalGroup( parsedType[ 0 ], 2 );
 
     if( ! hostRowLengths.setSize( size ) ||
-        ! deviceRowLengths.setSize( size ) ||
         ! hostMatrix.setDimensions( size, size ) ||
-        ! deviceMatrix.setDimensions( size, size ) ||
         ! hostVector.setSize( size ) ||
-        ! hostVector2.setSize( size ) ||
+        ! hostVector2.setSize( size )
+#ifdef HAVE_CUDA
+        ||
+        ! deviceRowLengths.setSize( size ) ||
+        ! deviceMatrix.setDimensions( size, size ) ||
         ! deviceVector.setSize( size ) ||
-        ! deviceVector2.setSize( size ) )
+        ! deviceVector2.setSize( size )
+#endif
+        )
     {
         const char* msg = "error: allocation of vectors failed";
         std::cerr << msg << std::endl;
@@ -122,7 +138,9 @@ benchmarkSpMV( Benchmark & benchmark,
     }
 
     hostRowLengths.setValue( elementsPerRow );
+#ifdef HAVE_CUDA
     deviceRowLengths.setValue( elementsPerRow );
+#endif
 
     if( ! hostMatrix.setCompressedRowsLengths( hostRowLengths ) ) {
         const char* msg = "error: allocation of host matrix failed";
@@ -130,12 +148,14 @@ benchmarkSpMV( Benchmark & benchmark,
         benchmark.addErrorMessage( msg, 2 );
         return false;
     }
+#ifdef HAVE_CUDA
     if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) {
         const char* msg = "error: allocation of device matrix failed";
         std::cerr << msg << std::endl;
         benchmark.addErrorMessage( msg, 2 );
         return false;
     }
+#endif
 
     const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
     setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
@@ -144,9 +164,11 @@ benchmarkSpMV( Benchmark & benchmark,
     // reset function
     auto reset = [&]() {
         hostVector.setValue( 1.0 );
-        deviceVector.setValue( 1.0 );
         hostVector2.setValue( 0.0 );
+#ifdef HAVE_CUDA
+        deviceVector.setValue( 1.0 );
         deviceVector2.setValue( 0.0 );
+#endif
     };
 
     // compute functions
@@ -158,9 +180,10 @@ benchmarkSpMV( Benchmark & benchmark,
     };
 
     benchmark.setOperation( datasetSize );
-    benchmark.time( reset,
-                    "CPU", spmvHost,
-                    "GPU", spmvCuda );
+    benchmark.time( reset, "CPU", spmvHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset, "GPU", spmvCuda );
+#endif
 
     return true;
 }
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.cu b/tests/benchmarks/tnl-benchmark-blas.cpp
similarity index 80%
rename from tests/benchmarks/tnl-cuda-benchmarks.cu
rename to tests/benchmarks/tnl-benchmark-blas.cpp
index b92e8d52a59f5ad62714464917b8dbdb19624d7e..76ec0448a74f84d76e78ef6237cc44736d015abf 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.cu
+++ b/tests/benchmarks/tnl-benchmark-blas.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnl-cuda-benchmarks.cu  -  description
+                          tnl-benchmark-blas.cpp  -  description
                              -------------------
     begin                : May 28, 2015
     copyright            : (C) 2015 by Tomas Oberhuber
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "tnl-cuda-benchmarks.h"
+#include "tnl-benchmark-blas.h"
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.cpp b/tests/benchmarks/tnl-benchmark-blas.cu
similarity index 61%
rename from src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.cpp
rename to tests/benchmarks/tnl-benchmark-blas.cu
index ebce329454f166df941c143237f173d801ffe697..f35d5a5f5b6de701687adb51b5bd52e8b6b0c1f5 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.cpp
+++ b/tests/benchmarks/tnl-benchmark-blas.cu
@@ -1,13 +1,11 @@
 /***************************************************************************
-                          cuda-prefix-sum_impl.cpp  -  description
+                          tnl-benchmark-blas.cu  -  description
                              -------------------
-    begin                : Jan 18, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
+    begin                : May 28, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-
-
-
+#include "tnl-benchmark-blas.h"
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-benchmark-blas.h
similarity index 73%
rename from tests/benchmarks/tnl-cuda-benchmarks.h
rename to tests/benchmarks/tnl-benchmark-blas.h
index 1b0aba5303b3eb8f7dd7fd16f0f054c12421332b..ecf9b08c65d8c36b47d51dff4c062b1c45cd8b45 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-benchmark-blas.h
@@ -1,17 +1,18 @@
 /***************************************************************************
-                          tnl-benchmarks.h  -  description
+                          tnl-benchmark-blas.h  -  description
                              -------------------
     begin                : Jan 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
+    copyright            : (C) 2010 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
-#ifndef tnlCudaBENCHMARKS_H_
-#define TNLCUDBENCHMARKS_H_
+// Implemented by: Jakub Klinkovsky
 
-#include <TNL/SystemInfo.h>
+#pragma once
+
+#include <TNL/Devices/Host.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
@@ -29,10 +30,10 @@ using namespace TNL::benchmarks;
 
 template< typename Real >
 void
-runCudaBenchmarks( Benchmark & benchmark,
+runBlasBenchmarks( Benchmark & benchmark,
                    Benchmark::MetadataMap metadata,
-                   const unsigned & minSize,
-                   const unsigned & maxSize,
+                   const std::size_t & minSize,
+                   const std::size_t & maxSize,
                    const double & sizeStepFactor,
                    const unsigned & loops,
                    const unsigned & elementsPerRow )
@@ -43,7 +44,7 @@ runCudaBenchmarks( Benchmark & benchmark,
     // Array operations
     benchmark.newBenchmark( String("Array operations (") + precision + ")",
                             metadata );
-    for( unsigned size = minSize; size <= maxSize; size *= 2 ) {
+    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
         benchmark.setMetadataColumns( Benchmark::MetadataColumns({
            {"size", size},
         } ));
@@ -53,7 +54,7 @@ runCudaBenchmarks( Benchmark & benchmark,
     // Vector operations
     benchmark.newBenchmark( String("Vector operations (") + precision + ")",
                             metadata );
-    for( unsigned size = minSize; size <= maxSize; size *= sizeStepFactor ) {
+    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
         benchmark.setMetadataColumns( Benchmark::MetadataColumns({
            {"size", size},
         } ));
@@ -63,7 +64,7 @@ runCudaBenchmarks( Benchmark & benchmark,
     // Sparse matrix-vector multiplication
     benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                             metadata );
-    for( unsigned size = minSize; size <= maxSize; size *= 2 ) {
+    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
         benchmark.setMetadataColumns( Benchmark::MetadataColumns({
             {"rows", size},
             {"columns", size},
@@ -77,7 +78,7 @@ void
 setupConfig( Config::ConfigDescription & config )
 {
     config.addDelimiter( "Benchmark settings:" );
-    config.addEntry< String >( "log-file", "Log file name.", "tnl-cuda-benchmarks.log");
+    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
     config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
     config.addEntryEnum( "append" );
     config.addEntryEnum( "overwrite" );
@@ -91,12 +92,15 @@ setupConfig( Config::ConfigDescription & config )
     config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
     config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 );
     config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+    config.addDelimiter( "Device settings:" );
+    Devices::Host::configSetup( config );
+    Devices::Cuda::configSetup( config );
 }
 
 int
 main( int argc, char* argv[] )
 {
-#ifdef HAVE_CUDA
     Config::ParameterContainer parameters;
     Config::ConfigDescription conf_desc;
 
@@ -107,11 +111,19 @@ main( int argc, char* argv[] )
         return 1;
     }
 
+    Devices::Host::setup( parameters );
+    Devices::Cuda::setup( parameters );
+
     const String & logFileName = parameters.getParameter< String >( "log-file" );
     const String & outputMode = parameters.getParameter< String >( "output-mode" );
     const String & precision = parameters.getParameter< String >( "precision" );
-    const unsigned minSize = parameters.getParameter< unsigned >( "min-size" );
-    const unsigned maxSize = parameters.getParameter< unsigned >( "max-size" );
+    // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(),
+    // which have a default value. The workaround below works for int values, but it is not possible
+    // to pass 64-bit integer values
+//    const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" );
+//    const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
+    const std::size_t minSize = parameters.getParameter< int >( "min-size" );
+    const std::size_t maxSize = parameters.getParameter< int >( "max-size" );
     const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
     const unsigned loops = parameters.getParameter< unsigned >( "loops" );
     const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" );
@@ -132,27 +144,29 @@ main( int argc, char* argv[] )
     Benchmark benchmark( loops, verbose );
 
     // prepare global metadata
-    SystemInfo systemInfo;
     const int cpu_id = 0;
-    tnlCacheSizes cacheSizes = systemInfo.getCPUCacheSizes( cpu_id );
+    Devices::CacheSizes cacheSizes = Devices::Host::getCPUCacheSizes( cpu_id );
     String cacheInfo = String( cacheSizes.L1data ) + ", "
                         + String( cacheSizes.L1instruction ) + ", "
                         + String( cacheSizes.L2 ) + ", "
                         + String( cacheSizes.L3 );
+#ifdef HAVE_CUDA
     const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
     const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
-                                 String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+                              String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+#endif
     Benchmark::MetadataMap metadata {
-        { "host name", systemInfo.getHostname() },
-        { "architecture", systemInfo.getArchitecture() },
-        { "system", systemInfo.getSystemName() },
-        { "system release", systemInfo.getSystemRelease() },
-        { "start time", systemInfo.getCurrentTime() },
-        { "CPU model name", systemInfo.getCPUModelName( cpu_id ) },
-        { "CPU cores", systemInfo.getNumberOfCores( cpu_id ) },
-        { "CPU threads per core", systemInfo.getNumberOfThreads( cpu_id ) / systemInfo.getNumberOfCores( cpu_id ) },
-        { "CPU max frequency (MHz)", systemInfo.getCPUMaxFrequency( cpu_id ) / 1e3 },
+        { "host name", Devices::Host::getHostname() },
+        { "architecture", Devices::Host::getArchitecture() },
+        { "system", Devices::Host::getSystemName() },
+        { "system release", Devices::Host::getSystemRelease() },
+        { "start time", Devices::Host::getCurrentTime() },
+        { "CPU model name", Devices::Host::getCPUModelName( cpu_id ) },
+        { "CPU cores", Devices::Host::getNumberOfCores( cpu_id ) },
+        { "CPU threads per core", Devices::Host::getNumberOfThreads( cpu_id ) / Devices::Host::getNumberOfCores( cpu_id ) },
+        { "CPU max frequency (MHz)", Devices::Host::getCPUMaxFrequency( cpu_id ) / 1e3 },
         { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
+#ifdef HAVE_CUDA
         { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
         { "GPU architecture", deviceArch },
         { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) },
@@ -160,12 +174,13 @@ main( int argc, char* argv[] )
         { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 },
         { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 },
         { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) },
+#endif
     };
 
     if( precision == "all" || precision == "float" )
-        runCudaBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+        runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
     if( precision == "all" || precision == "double" )
-        runCudaBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
+        runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow );
 
     if( ! benchmark.save( logFile ) ) {
         std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
@@ -173,10 +188,4 @@ main( int argc, char* argv[] )
     }
 
     return EXIT_SUCCESS;
-#else
-    CudaSupportMissingMessage;
-    return EXIT_FAILURE;
-#endif
 }
-
-#endif /* Devices::CudaBENCHMARKS_H_ */
diff --git a/tests/benchmarks/tnlCusparseCSRMatrix.h b/tests/benchmarks/tnlCusparseCSRMatrix.h
index 85a67386cff105b3af75966bf74ada58d79d59d1..08192c70fceff0739ce75678595dcc86b49e286b 100644
--- a/tests/benchmarks/tnlCusparseCSRMatrix.h
+++ b/tests/benchmarks/tnlCusparseCSRMatrix.h
@@ -60,7 +60,7 @@ class tnlCusparseCSRBase
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         Assert( matrix, );
+         TNL_ASSERT( matrix, );
 #ifdef HAVE_CUDA
          cusparseDcsrmv( *( this->cusparseHandle ),
                          CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -103,7 +103,7 @@ class tnlCusparseCSR< double > : public tnlCusparseCSRBase< double >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         Assert( matrix, );
+         TNL_ASSERT( matrix, );
 #ifdef HAVE_CUDA
          /*cusparseDcsrmv( *( this->cusparseHandle ),
                          CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -131,7 +131,7 @@ class tnlCusparseCSR< float > : public tnlCusparseCSRBase< float >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const
       {
-         Assert( matrix, );
+         TNL_ASSERT( matrix, );
 #ifdef HAVE_CUDA
          /*cusparseScsrmv( *( this->cusparseHandle ),
                          CUSPARSE_OPERATION_NON_TRANSPOSE,
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
index 4d0a7e00971d3aae415917888222c586fed48912..1c663ac20de0ecb1bfe20300787a43303b1f65bc 100644
--- a/tests/benchmarks/vector-operations.h
+++ b/tests/benchmarks/vector-operations.h
@@ -1,5 +1,19 @@
+/***************************************************************************
+                          vector-operations.h  -  description
+                             -------------------
+    begin                : Dec 30, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
+#include <stdlib.h> // srand48
+
 #include "benchmarks.h"
 
 #include <TNL/Containers/Vector.h>
@@ -18,7 +32,7 @@ template< typename Real = double,
 bool
 benchmarkVectorOperations( Benchmark & benchmark,
                            const int & loops,
-                           const int & size )
+                           const long & size )
 {
     typedef Containers::Vector< Real, Devices::Host, Index > HostVector;
     typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector;
@@ -29,9 +43,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     HostVector hostVector, hostVector2;
     CudaVector deviceVector, deviceVector2;
     if( ! hostVector.setSize( size ) ||
-        ! hostVector2.setSize( size ) ||
+        ! hostVector2.setSize( size )
+#ifdef HAVE_CUDA
+        ||
         ! deviceVector.setSize( size ) ||
-        ! deviceVector2.setSize( size ) )
+        ! deviceVector2.setSize( size )
+#endif
+        )
     {
         const char* msg = "error: allocation of vectors failed";
         std::cerr << msg << std::endl;
@@ -53,12 +71,19 @@ benchmarkVectorOperations( Benchmark & benchmark,
     // of the benchmark loop.)
     auto reset1 = [&]() {
         hostVector.setValue( 1.0 );
+#ifdef HAVE_CUDA
         deviceVector.setValue( 1.0 );
+#endif
+        // A relatively harmless call to keep the compiler from realizing we
+        // don't actually do any useful work with the result of the reduciton.
+        srand48(resultHost);
         resultHost = resultDevice = 0.0;
     };
     auto reset2 = [&]() {
         hostVector2.setValue( 1.0 );
+#ifdef HAVE_CUDA
         deviceVector2.setValue( 1.0 );
+#endif
     };
     auto reset12 = [&]() {
         reset1();
@@ -84,12 +109,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-    benchmark.time( reset1,
-                    "CPU", multiplyHost,
-                    "GPU", multiplyCuda );
+    benchmark.time( reset1, "CPU", multiplyHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", multiplyCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", multiplyCublas );
 #endif
+#endif
 
 
     auto addVectorHost = [&]() {
@@ -108,12 +134,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "vector addition", 3 * datasetSize );
-    benchmark.time( reset1,
-                    "CPU", addVectorHost,
-                    "GPU", addVectorCuda );
+    benchmark.time( reset1, "CPU", addVectorHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", addVectorCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", addVectorCublas );
 #endif
+#endif
 
 
     auto maxHost = [&]() {
@@ -123,9 +150,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
         resultDevice = deviceVector.max();
     };
     benchmark.setOperation( "max", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", maxHost,
-                    "GPU", maxCuda );
+    benchmark.time( reset1, "CPU", maxHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", maxCuda );
+#endif
 
 
     auto minHost = [&]() {
@@ -135,9 +163,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
         resultDevice = deviceVector.min();
     };
     benchmark.setOperation( "min", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", minHost,
-                    "GPU", minCuda );
+    benchmark.time( reset1, "CPU", minHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", minCuda );
+#endif
 
 
     auto absMaxHost = [&]() {
@@ -156,12 +185,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "absMax", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", absMaxHost,
-                    "GPU", absMaxCuda );
+    benchmark.time( reset1, "CPU", absMaxHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", absMaxCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", absMaxCublas );
 #endif
+#endif
 
 
     auto absMinHost = [&]() {
@@ -180,12 +210,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "absMin", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", absMinHost,
-                    "GPU", absMinCuda );
+    benchmark.time( reset1, "CPU", absMinHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", absMinCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", absMinCublas );
 #endif
+#endif
 
 
     auto sumHost = [&]() {
@@ -195,9 +226,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
         resultDevice = deviceVector.sum();
     };
     benchmark.setOperation( "sum", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", sumHost,
-                    "GPU", sumCuda );
+    benchmark.time( reset1, "CPU", sumHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", sumCuda );
+#endif
 
 
     auto l1normHost = [&]() {
@@ -214,12 +246,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "l1 norm", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", l1normHost,
-                    "GPU", l1normCuda );
+    benchmark.time( reset1, "CPU", l1normHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", l1normCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", l1normCublas );
 #endif
+#endif
 
 
     auto l2normHost = [&]() {
@@ -236,12 +269,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "l2 norm", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", l2normHost,
-                    "GPU", l2normCuda );
+    benchmark.time( reset1, "CPU", l2normHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", l2normCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", l2normCublas );
 #endif
+#endif
 
 
     auto l3normHost = [&]() {
@@ -251,9 +285,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
         resultDevice = deviceVector.lpNorm( 3.0 );
     };
     benchmark.setOperation( "l3 norm", datasetSize );
-    benchmark.time( reset1,
-                    "CPU", l3normHost,
-                    "GPU", l3normCuda );
+    benchmark.time( reset1, "CPU", l3normHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", l3normCuda );
+#endif
 
 
     auto scalarProductHost = [&]() {
@@ -271,11 +306,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
     };
 #endif
     benchmark.setOperation( "scalar product", 2 * datasetSize );
-    benchmark.time( reset1,
-                    "CPU", scalarProductHost,
-                    "GPU", scalarProductCuda );
+    benchmark.time( reset1, "CPU", scalarProductHost );
+#ifdef HAVE_CUDA
+    benchmark.time( reset1, "GPU", scalarProductCuda );
 #ifdef HAVE_CUBLAS
     benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+#endif
 #endif
 
     /*
diff --git a/tests/unit-tests/solver/tnlMersonSolverTester.h b/tests/unit-tests/solver/tnlMersonSolverTester.h
index 838da12e964de24f13276888252bbbb9a99f2ad4..f158a62515b2b28b107f5c1b9521b4e4094bce81 100644
--- a/tests/unit-tests/solver/tnlMersonSolverTester.h
+++ b/tests/unit-tests/solver/tnlMersonSolverTester.h
@@ -99,7 +99,7 @@ class MersonTester : public CppUnit :: TestCase
       return suiteOfTests;
    };
 
-   void GetExplicitRHS( const Real& time,
+   void getExplicitUpdate( const Real& time,
                         GridOld< 2, Real, Devices::Host, int >& u,
                         GridOld< 2, Real, Devices::Host, int >& fu )
    {
@@ -115,7 +115,7 @@ class MersonTester : public CppUnit :: TestCase
          }
    }
 
-   void GetExplicitRHS( const Real& time,
+   void getExplicitUpdate( const Real& time,
                         GridOld< 2, Real, Devices::Cuda, int >& u,
                         GridOld< 2, Real, Devices::Cuda, int >& fu )
    {
diff --git a/tests/unit-tests/solver/tnlSolverTester.h b/tests/unit-tests/solver/tnlSolverTester.h
index 05bc787b065acea7fa206a8c8350f773ead8d33c..62c489727c432fe6f9f1c1c3171aa7b2ee8a316c 100644
--- a/tests/unit-tests/solver/tnlSolverTester.h
+++ b/tests/unit-tests/solver/tnlSolverTester.h
@@ -52,7 +52,7 @@ class SolverTesterProblem
 
    DofVectorType& getDofVector() { return this->dofVector;};
 
-   void GetExplicitRHS( const RealType& time,
+   void getExplicitUpdate( const RealType& time,
                         const RealType& tau,
                         DofVectorType& _u,
                         DofVectorType& _fu ){};