diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index d0d1eda9b89edfa7d2c752187976f62c8635913b..294006c088765aea26ae189199f54fc49bb19ecb 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -6,12 +6,14 @@ ELSE()
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
 ENDIF()
 
-IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS
-   ParallelForExample.out
- )
-ELSE()
+ADD_EXECUTABLE(staticForExample staticForExample.cpp)
+ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out )
+
+ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp)
+ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out )
+
 ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    ParallelForExample.out
- )
-ENDIF()
\ No newline at end of file
+   unrolledForExample.out
+   staticForExample.out
+)
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
index aafff2466415ed5f710eb59fd25dc1925b959b1e..ecc53948c4823dcf9fd633e08a61bd6cb32dad22 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
@@ -13,10 +12,12 @@ void initMeshFunction( const int xSize,
                        Vector< double, Device >& v,
                        const double& c )
 {
-   auto view = v1.getConstView();
-   auto init = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
-      view[ j * xSize + i ] =  c; };
-   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j ) mutable
+   {
+      view[ j * xSize + i ] = c;
+   };
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init );
 }
 
 int main( int argc, char* argv[] )
@@ -42,4 +43,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
index 3cb9b5b6482c8270e53b0bca9708705a8b3c28c4..8eb5ff3157178362c8d057a9024c8c4d82dd734c 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
@@ -14,16 +13,18 @@ void initMeshFunction( const int xSize,
                        Vector< double, Device >& v,
                        const double& c )
 {
-   auto view = v1.getConstView();
-   auto init = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
-      view[ ( k * ySize + j ) * xSize + i ] =  c; };
-   ParallelFor3D< Device >::exec( 0, 0, xSize, ySize, init, xSize, ySize, c );
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j, int k ) mutable
+   {
+      view[ ( k * ySize + j ) * xSize + i ] = c;
+   };
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, init );
 }
 
 int main( int argc, char* argv[] )
 {
    /***
-    * Define dimensions of 2D mesh function.
+    * Define dimensions of a 3D mesh function.
     */
    const int xSize( 10 ), ySize( 10 ), zSize( 10 );
    const int size = xSize * ySize * zSize;
@@ -43,4 +44,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
index 9c056fa1d2800e337eba71f78ee8d2c0f6f594cd..dd818856bc7e493bffa58269de997207acfc066b 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -1,10 +1,10 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Algorithms;
 
 /****
  * Set all elements of the vector v to the constant c.
@@ -14,10 +14,11 @@ void initVector( Vector< double, Device >& v,
                  const double& c )
 {
    auto view = v.getView();
-   auto init = [=] __cuda_callable__  ( int i, const double c ) mutable {
-      view[ i ] = c; };
-
-   Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+   auto init = [=] __cuda_callable__ ( int i ) mutable
+   {
+      view[ i ] = c;
+   };
+   ParallelFor< Device >::exec( 0, v.getSize(), init );
 }
 
 int main( int argc, char* argv[] )
@@ -39,4 +40,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
deleted file mode 100644
index a2fce79ae670bcda93da5fa6c1a3e95d1d260475..0000000000000000000000000000000000000000
--- a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-const int Size( 5 );
-
-template< int I >
-struct LoopBody
-{
-   static void exec( const StaticVector< Size, double >& v ) {
-      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
-   }
-};
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Initiate static vector
-    */
-   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
-
-   /****
-    * Print out the vector using template parameters for indexing.
-    */
-   Algorithms::TemplateStaticFor< 0, Size, LoopBody >::exec( v );
-}
-
diff --git a/Documentation/Examples/Algorithms/staticForExample.cpp b/Documentation/Examples/Algorithms/staticForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3c4a68f45b6db7b2ee5cee6bb41a76f9253b474
--- /dev/null
+++ b/Documentation/Examples/Algorithms/staticForExample.cpp
@@ -0,0 +1,16 @@
+#include <iostream>
+#include <array>
+#include <TNL/Algorithms/staticFor.h>
+
+int main( int argc, char* argv[] )
+{
+   // initialize a std::array with five elements
+   std::array< int, 5 > a{ 1, 2, 3, 4, 5 };
+
+   // print the array: each index `i` is passed to std::get as a template argument
+   TNL::Algorithms::staticFor< int, 0, 5 >(
+      [&a] ( auto i ) {
+         std::cout << "a[ " << i << " ] = " << std::get< i >( a ) << std::endl;
+      }
+   );
+}
diff --git a/Documentation/Examples/Algorithms/StaticForExample.cpp b/Documentation/Examples/Algorithms/unrolledForExample.cpp
similarity index 69%
rename from Documentation/Examples/Algorithms/StaticForExample.cpp
rename to Documentation/Examples/Algorithms/unrolledForExample.cpp
index 47757458d71505fa585f3624575bcdaa55869602..912029e3e108cb0bdfc502383b7c9ac5c30fe494 100644
--- a/Documentation/Examples/Algorithms/StaticForExample.cpp
+++ b/Documentation/Examples/Algorithms/unrolledForExample.cpp
@@ -1,7 +1,6 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
@@ -20,9 +19,12 @@ int main( int argc, char* argv[] )
    /****
     * Compute an addition of a vector and a constant number.
     */
-   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
-   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&]( int i ) {
+         a[ i ] = b[ i ] + 3.14;
+         sum += a[ i ];
+      }
+   );
    std::cout << "a = " << a << std::endl;
    std::cout << "sum = " << sum << std::endl;
 }
-
diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index ca8662ad0aaa59c7be58ea5c1db3b92fddadde28..29ba5a5dfc33b4fb6bbc88edd7f467e136f602ae 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -28,6 +28,9 @@ ADD_EXECUTABLE( ObjectExample_getType ObjectExample_getType.cpp )
 ADD_CUSTOM_COMMAND( COMMAND ObjectExample_getType > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ObjectExample_getType.out OUTPUT ObjectExample_getType.out )
 
 ADD_EXECUTABLE( ParameterContainerExample ParameterContainerExample.cpp )
+ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
+ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
+ADD_EXECUTABLE( MathExample MathExample.cpp )
 
 ADD_EXECUTABLE( ParseObjectTypeExample ParseObjectTypeExample.cpp )
 ADD_CUSTOM_COMMAND( COMMAND ParseObjectTypeExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParseObjectTypeExample.out OUTPUT ParseObjectTypeExample.out )
diff --git a/src/Examples/ConfigDescriptionExample.cpp b/Documentation/Examples/ConfigDescriptionExample.cpp
similarity index 100%
rename from src/Examples/ConfigDescriptionExample.cpp
rename to Documentation/Examples/ConfigDescriptionExample.cpp
diff --git a/src/Examples/LoggerExample.cpp b/Documentation/Examples/LoggerExample.cpp
similarity index 100%
rename from src/Examples/LoggerExample.cpp
rename to Documentation/Examples/LoggerExample.cpp
diff --git a/src/Examples/MathExample.cpp b/Documentation/Examples/MathExample.cpp
similarity index 100%
rename from src/Examples/MathExample.cpp
rename to Documentation/Examples/MathExample.cpp
diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt
index 5511d063369ab83aebcb680e50e09d402673d75b..05ed1f33cc0ea1fd0c653439b9cd40f50a34510c 100644
--- a/Documentation/Tutorials/CMakeLists.txt
+++ b/Documentation/Tutorials/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory( GeneralConcepts )
 add_subdirectory( Arrays )
 add_subdirectory( Vectors )
 add_subdirectory( ReductionAndScan )
-add_subdirectory( ForLoops )
 add_subdirectory( Pointers )
 add_subdirectory( Matrices )
 add_subdirectory( Meshes )
diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
deleted file mode 100644
index 74d8d1b0f297fc4939489bd5bae63c346d5946f7..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
-   CUDA_ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cu )
-   CUDA_ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cu )
-ELSE()
-   ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D_ug.cpp )
-   ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D_ug.cpp )
-ENDIF()
-
-ADD_EXECUTABLE( StaticForExample StaticForExample_ug.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out )
-
-ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample_ug.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out )
-
-IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS
-   ParallelForExample.out
-   StaticForExample.out
-   TemplateStaticForExample.out )
-ENDIF()
diff --git a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
deleted file mode 100644
index 40f29313a6189b8576ae7f9a80ee48ce5f9eda39..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-for( Index j = startY; j < endY; j++ )
-   for( Index i = startX; i < endX; i++ )
-      f( i, j, args... );
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
deleted file mode 120000
index 4a443ad3b663a61c99d0dc4650847cfdcdf73c40..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample-2D_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp
deleted file mode 100644
index 388c326ec2085d10ba50d265ee6bd8763310ae5a..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void meshFunctionSum( const int xSize,
-                      const int ySize,
-                      const Vector< double, Device >& v1,
-                      const Vector< double, Device >& v2,
-                      const double& c,
-                      Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
-      const int idx = j * xSize + i;
-      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
-
-   Algorithms::ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Define dimensions of 2D mesh function.
-    */
-   const int xSize( 10 ), ySize( 10 );
-   const int size = xSize * ySize;
-
-   /***
-    * Firstly, test the mesh functions sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
-   host_v1 = 1.0;
-   host_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, host_v1, host_v2, 2.0, host_result );
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
-   cuda_v1 = 1.0;
-   cuda_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, cuda_v1, cuda_v2, 2.0, cuda_result );
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
deleted file mode 120000
index 79ef7851f4d5bb1b34c9e9a47e7060a51ae04465..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample-3D_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp
deleted file mode 100644
index 37e07c75ec1857d6792c43b11c1daf022dd1d2e9..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void meshFunctionSum( const int xSize,
-                      const int ySize,
-                      const int zSize,
-                      const Vector< double, Device >& v1,
-                      const Vector< double, Device >& v2,
-                      const double& c,
-                      Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
-      const int idx = ( k * ySize + j ) * xSize + i;
-      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
-
-   Algorithms::ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize,zSize, sum, xSize, ySize, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Define dimensions of 3D mesh function.
-    */
-   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
-   const int size = xSize * ySize * xSize;
-
-   /***
-    * Firstly, test the mesh functions sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
-   host_v1 = 1.0;
-   host_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, zSize, host_v1, host_v2, 2.0, host_result );
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
-   cuda_v1 = 1.0;
-   cuda_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, zSize, cuda_v1, cuda_v2, 2.0, cuda_result );
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cu b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
deleted file mode 120000
index 79d405285e5a8e6f874b0af742b2f12f956402a7..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp
deleted file mode 100644
index cf91d69ed25f3c51d33eeae677d63fd628a83708..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void vectorSum( const Vector< double, Device >& v1,
-                const Vector< double, Device >& v2,
-                const double& c,
-                Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, const double c ) mutable {
-      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
-
-   Algorithms::ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the vectors sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
-   host_v1 = 1.0;
-   host_v2.forAllElements( []__cuda_callable__ ( int i, double& value ) { value = i; } );
-   vectorSum( host_v1, host_v2, 2.0, host_result );
-   std::cout << "host_v1 = " << host_v1 << std::endl;
-   std::cout << "host_v2 = " << host_v2 << std::endl;
-   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
-   cuda_v1 = 1.0;
-   cuda_v2.forAllElements( []__cuda_callable__ ( int i, double& value ) { value = i; } );
-   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
-   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
-   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
-   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
deleted file mode 100644
index 7ee4afd72c42e2bf0fd8db28bd3f5e7c3c47cc0f..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-for( int i = 0; i < Size; i++ )
-{
-   a[ i ] = b[ i ] + c; sum += a[ i ];
-};
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
deleted file mode 100644
index 5298b00a138b547d4fac56af327a146771eae13e..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
+++ /dev/null
@@ -1 +0,0 @@
-Algorithms::StaticFor< 0, Size, true >::exec( addition, 3.14 );
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp b/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp
deleted file mode 100644
index 47757458d71505fa585f3624575bcdaa55869602..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/StaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Create two static vectors
-    */
-   const int Size( 3 );
-   StaticVector< Size, double > a, b;
-   a = 1.0;
-   b = 2.0;
-   double sum( 0.0 );
-
-   /****
-    * Compute an addition of a vector and a constant number.
-    */
-   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
-   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
-   std::cout << "a = " << a << std::endl;
-   std::cout << "sum = " << sum << std::endl;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp b/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp
deleted file mode 100644
index eb65fd6ccb90580672699a323269888c31b4415b..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-using Index = int;
-const Index Size( 5 );
-
-template< Index I >
-struct LoopBody
-{
-   static void exec( const StaticVector< Size, double >& v ) {
-      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
-   }
-};
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Initiate static vector
-    */
-   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
-
-   /****
-    * Print out the vector using template parameters for indexing.
-    */
-   Algorithms::TemplateStaticFor< Index, 0, Size, LoopBody >::exec( v );
-}
-
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 6b25c6d49818cc7545d49e94745fe5eae13c8cb8..1284783e62fbcc06ca208b42d88aaa0fe101d59d 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -4,24 +4,32 @@
 
 ## Introduction
 
-This tutorial shows how to use different kind of for loops implemented in TNL. Namely, they are:
+This tutorial shows how to use different kinds of for-loops implemented in TNL. Namely, they are:
 
-* **Parallel for** is a for loop which can be run in parallel, i.e. all iterations of the loop must be independent. Paralle for can run on both multicore CPUs and GPUs.
-* **n-dimensional Parallel For** is extension of common parallel for into more dimensions.
-* **Static For** is a for loop which is performed sequentialy and it is explicitly unrolled by C++ templates. Number of iterations must be static (known at compile time).
-* **Templated Static For** ....
+* **Parallel for** is a for-loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel for can be run on both multicore CPUs and GPUs.
+* **n-dimensional parallel for** is an extension of common parallel for into higher dimensions.
+* **Unrolled for** is a for-loop which is performed sequentially and it is explicitly unrolled by C++ templates. Iteration bounds must be static (known at compile time).
+* **Static for** is a for-loop with static bounds (known at compile time) and indices usable in constant expressions.
 
 ## Parallel For
 
-Basic parallel for construction in TNL serves for hardware platform transparent expression of parallel for loops. The hardware platform is expressed by a template parameter. The parallel for is defined as:
+Basic _parallel for_ construction in TNL serves for hardware platform transparent expression of parallel for-loops.
+The hardware platform is specified by a template parameter.
+The loop is implemented as \ref TNL::Algorithms::ParallelFor and can be used as:
 
 ```
 ParallelFor< Device >::exec( start, end, function, arguments... );
 ```
 
-The `Device` can be either `Devices::Host` or `Devices::Cuda`. The first two parameters define the loop bounds in the C style. It means that there will be iterations for indexes `start` ... `end-1`. Function is a lambda function to be performed in each iteration. It is supposed to receive the iteration index and arguments passed to the parallel for (the last arguments). See the following example:
+The `Device` can be either \ref TNL::Devices::Host or \ref TNL::Devices::Cuda.
+The first two parameters define the loop bounds in the C style.
+It means that there will be iterations for indices `start`, `start+1`, ..., `end-1`.
+The `function` is a lambda function to be called in each iteration.
+It is supposed to receive the iteration index and arguments passed to the _parallel for_ (the last arguments).
 
-\include ParallelForExample_ug.cpp
+See the following example:
+
+\include ParallelForExample.cpp
 
 The result is:
 
@@ -29,68 +37,94 @@ The result is:
 
 ## n-dimensional Parallel For
 
-Performing for-loops in higher dimensions is simillar. In the following example we build 2D mesh function on top of TNL vector. Two dimensional indexes `( i, j )` are mapped to vector index `idx` as `idx = j * xSize + i`, where the mesh fuction has dimensions `xSize * ySize`. Of course, in this simple example, it does not make any sense to compute a sum of two mesh function this way, it is only an example.
+For-loops in higher dimensions can be performed similarly via \ref TNL::Algorithms::ParallelFor2D and \ref TNL::Algorithms::ParallelFor3D.
+In the following example we build a 2D mesh function on top of \ref TNL::Containers::Vector.
+Two dimensional indices `( i, j )` are mapped to the vector index `idx` as `idx = j * xSize + i`, where the mesh function has dimensions `xSize * ySize`.
+The following simple example performs initialization of the mesh function with a constant value `c = 1.0`:
 
-\include ParallelForExample-2D_ug.cpp
+\include ParallelForExample-2D.cpp
 
-Notice the parameters of the lambda function `sum`. The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i` to acces the vector elements sequentialy on CPU and to fullfill coalesced memory accesses on GPU. The for-loop is executed by calling `ParallelFor2D` with proper device. The first four parameters are `startX, startY, endX, endY` and on CPU this is equivalent to the following embeded for loops:
+Notice the parameters of the lambda function `init`.
+The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i` to access the vector elements sequentially on CPU and to fulfill coalesced memory accesses on GPU.
+The for-loop is executed by calling `ParallelFor2D` with proper device.
+The first four parameters are `startX, startY, endX, endY` and on CPU this is equivalent to the following embedded for-loops:
 
-\include ParallelFor2D-snippet.cpp
+```cpp
+for( Index j = startY; j < endY; j++ )
+   for( Index i = startX; i < endX; i++ )
+      f( i, j, args... );
+```
 
-where `args...` stand for additional arguments passed to the for-loop. After the parameters defining the loops bounds, lambda function (`sum` in this case) is passed followed by additional arguments. One of them, in our example, is `xSize` again because it must be passed to the lambda function for the index mapping computation.
+where `args...` stand for additional arguments passed to the for-loop.
+After the parameters defining the loop bounds, the lambda function (`init` in this case) is passed, followed by additional arguments that are forwarded to the lambda function after the iteration indices.
+In the example above there are no additional arguments, since the lambda function `init` captures all variables it needs to work with.
 
-For the completness, we show modification of the previous example into 3D:
+For completeness, we show modification of the previous example into 3D:
 
-\include ParallelForExample-3D_ug.cpp
+\include ParallelForExample-3D.cpp
 
-## Static For
+## Unrolled For
 
-Static for-loop is designed for short loops with constant (i.e. known at the compile time) number of iterations. It is often used with static arrays and vectors. An adventage of this kind of for loop is that it is explicitly unrolled when the loop is short (up to eight iterations). See the following example:
+\ref TNL::Algorithms::unrolledFor is a for-loop that is explicitly unrolled via C++ templates when the loop is short (up to eight iterations).
+The bounds of `unrolledFor` loops must be constant (i.e. known at the compile time).
+It is often used with static arrays and vectors.
 
-\include StaticForExample_ug.cpp
+See the following example:
 
-Notice that the static for-loop works with a lambda function simillar to parallel for-loop. The bounds of the loop are passed as template parameters in the statement `Algorithms::StaticFor< 0, Size >`. The parameters of the static method `exec` are the lambda functions to be performed in each iteration and auxiliar data to be passed to the function. The function gets the loop index `i` first followed by the auxiliary data `sum` in this example.
+\include unrolledForExample.cpp
+
+Notice that the unrolled for-loop works with a lambda function similar to parallel for-loop.
+The bounds of the loop are passed as template parameters in the statement `Algorithms::unrolledFor< int, 0, Size >`.
+The parameter of the `unrolledFor` function is the functor to be called in each iteration.
+The function gets the loop index `i` only.
 
 The result looks as:
 
-\include StaticForExample.out
+\include unrolledForExample.out
 
-The effect of `StaticFor` is really the same as usual for-loop. The following code does the same as the previous example:
+The effect of `unrolledFor` is really the same as usual for-loop.
+The following code does the same as the previous example:
 
-\include StaticForExample-2.cpp
+```cpp
+for( int i = 0; i < Size; i++ )
+{
+   a[ i ] = b[ i ] + 3.14;
+   sum += a[ i ];
+};
+```
 
-The benefit of `StaticFor` is mainly in the explicit unrolling of short loops which can improve the performance in some situations. `StaticFor` can be forced to do the loop-unrolling in any situations using the third template parameter as follows:
+The benefit of `unrolledFor` is mainly in the explicit unrolling of short loops which can improve performance in some situations.
+The maximum length of loops that will be fully unrolled can be specified using the fourth template parameter as follows:
 
-\include StaticForExample-3.cpp
+```cpp
+Algorithms::unrolledFor< int, 0, Size, 16 >( ... );
+```
 
-`StaticFor` can be used also in CUDA kernels.
+`unrolledFor` can be used also in CUDA kernels.
 
-## Templated Static For
+## Static For
 
-Templated static for-loop (`TemplateStaticFor`) is a for-loop in template parameters. For example, if class `LoopBody` is defined as
+\ref TNL::Algorithms::staticFor is a generic for-loop whose iteration indices are usable in constant expressions (e.g. template arguments). It can be used as
 
-```
-template< int i >
-struct LoopBody
-{
-   static void exec() { ... };
-}
+```cpp
+staticFor< int, 0, N >( f );
 ```
 
-one might need to execute the following sequence of statements:
+which results in the following sequence of function calls:
 
-```
-LoopBody< 0 >::exec();
-LoopBody< 1 >::exec();
-LoopBody< 3 >::exec();
+```cpp
+f( std::integral_constant< int, 0 >{} );
+f( std::integral_constant< int, 1 >{} );
+f( std::integral_constant< int, 2 >{} );
+f( std::integral_constant< int, 3 >{} );
 ...
-LoodBody< N >::exec();
+f( std::integral_constant< int, N-1 >{} );
 ```
 
-This is exactly what `TemplateStaticFor` can do - in a slightly more general way. See the following example:
+Notice that each iteration index is represented by its own distinct type using \ref std::integral_constant. Hence, the functor `f` must be generic, e.g. a _generic lambda expression_ such as in the following example:
 
-\include TemplateStaticForExample.cpp
+\include staticForExample.cpp
 
 The output looks as follows:
 
-\include TemplateStaticForExample.out
+\include staticForExample.out
diff --git a/src/Examples/CMakeLists.txt b/src/Examples/CMakeLists.txt
index 65281587449127e9d16787bcda8af234e562bcd5..ee71c4b5ec28aa8785b497e14451991a7dba357c 100644
--- a/src/Examples/CMakeLists.txt
+++ b/src/Examples/CMakeLists.txt
@@ -10,8 +10,3 @@ add_subdirectory( inviscid-flow-vl )
 add_subdirectory( flow )
 add_subdirectory( flow-sw )
 add_subdirectory( flow-vl )
-
-
-ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
-ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
-ADD_EXECUTABLE( MathExample MathExample.cpp )
diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp
index 0bfead2871a5d216522845680c02912cdcd1d8b6..4eb8a93695ecd1a7ee1792873745c7c93486c207 100644
--- a/src/TNL/Algorithms/Multireduction.hpp
+++ b/src/TNL/Algorithms/Multireduction.hpp
@@ -19,7 +19,7 @@
 #include <TNL/Assert.h>
 #include <TNL/Algorithms/Multireduction.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
-#include <TNL/Algorithms/CudaMultireductionKernel.h>
+#include <TNL/Algorithms/detail/CudaMultireductionKernel.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
 #include <TNL/Timer.h>
@@ -212,7 +212,7 @@ reduce( const Result zero,
 
    // start the reduction on the GPU
    Result* deviceAux1 = nullptr;
-   const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
+   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index cb096f8798176970b19770f5f9315abc3fdbcee2..97cf13c778d9aec73b866f1a3fe08b73acf5e171 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -53,10 +53,11 @@ enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
 
 /**
- * \brief Parallel for loop for one dimensional interval of indexes.
+ * \brief Parallel for loop for one dimensional interval of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -64,16 +65,17 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param start the for-loop iterates over index interval [start, end).
-    * \param end the for-loop iterates over index interval [start, end).
-    * \param f is the function to be called in each iteration
+    * \param start is the left bound of the iteration range `[start, end)`.
+    * \param end is the right bound of the iteration range `[start, end)`.
+    * \param f is the function to be called in each iteration.
     * \param args are additional parameters to be passed to the function f.
     *
     * \par Example
@@ -93,10 +95,11 @@ struct ParallelFor
 };
 
 /**
- * \brief Parallel for loop for two dimensional domain of indexes.
+ * \brief Parallel for loop for two dimensional domain of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -104,23 +107,26 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor2D
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param startX the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param startY the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param endX the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param endY the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
     * \param f is the function to be called in each iteration
     * \param args are additional parameters to be passed to the function f.
     *
     * The function f is called for each iteration as
     *
+    * \code
     * f( i, j, args... )
+    * \endcode
     *
     * where the first parameter is changing more often than the second one.
     *
@@ -142,10 +148,11 @@ struct ParallelFor2D
 };
 
 /**
- * \brief Parallel for loop for three dimensional domain of indexes.
+ * \brief Parallel for loop for three dimensional domain of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -153,25 +160,28 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor3D
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startX the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param startY the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param startZ the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endX the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endY the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endZ the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
     * \param f is the function to be called in each iteration
     * \param args are additional parameters to be passed to the function f.
     *
     * The function f is called for each iteration as
     *
+    * \code
     * f( i, j, k, args... )
+    * \endcode
     *
     * where the first parameter is changing the most often.
     *
diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/Reduction.hpp
index 7873f9c3c4268fbbec0cd7757bcfca0dade40869..3c029d9ab514f741999f6d078d99f3f333092c20 100644
--- a/src/TNL/Algorithms/Reduction.hpp
+++ b/src/TNL/Algorithms/Reduction.hpp
@@ -17,7 +17,7 @@
 //#define CUDA_REDUCTION_PROFILING
 
 #include <TNL/Algorithms/Reduction.h>
-#include <TNL/Algorithms/CudaReductionKernel.h>
+#include <TNL/Algorithms/detail/CudaReductionKernel.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
@@ -311,7 +311,7 @@ reduce( const Index begin,
       timer.start();
    #endif
 
-   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
+   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
 
    // start the reduce on the GPU
    Result* deviceAux1( 0 );
@@ -401,7 +401,7 @@ reduceWithArgument( const Index begin,
       timer.start();
    #endif
 
-   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
+   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
 
    // start the reduce on the GPU
    Result* deviceAux1( nullptr );
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 74351077ebf85fa5c222c639f5503214cdb1844a..78d5eaf60ecc50833552feac667d0eb941f06bfb 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -17,7 +17,7 @@
 #include <TNL/Assert.h>
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/StaticArray.h>
-#include <TNL/Algorithms/CudaScanKernel.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -227,7 +227,7 @@ perform( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
+   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -253,7 +253,7 @@ performFirstPhase( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   return CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -281,7 +281,7 @@ performSecondPhase( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
       end - begin,
       &v.getData()[ begin ],  // output
       blockShifts.getData(),
diff --git a/src/TNL/Algorithms/StaticFor.h b/src/TNL/Algorithms/StaticFor.h
deleted file mode 100644
index 6a450638f49ed22bd899af2397df602d98ff74a3..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/StaticFor.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/***************************************************************************
-                          StaticFor.h  -  description
-                             -------------------
-    begin                : Jul 16, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Cuda/CudaCallable.h>
-
-namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief StaticFor is a wrapper for common for-loop with explicit unrolling.
- *
- * StaticFor can be used only for for-loops bounds of which are known at the
- * compile time. StaticFor performs explicit loop unrolling for better performance.
- * This, however, does not make sense for loops with a large iterations
- * count. For a very large iterations count it could trigger the compiler's
- * limit on recursive template instantiation. Also note that the compiler
- * will (at least partially) unroll loops with static bounds anyway. For theses
- * reasons, the explicit loop unrolling can be controlled by the third template
- * parameter.
- *
- * \tparam Begin the loop will iterate over indexes [Begin,End)
- * \tparam End the loop will iterate over indexes [Begin,End)
- * \tparam unrolled controls the explicit loop unrolling. If it is true, the
- *   unrolling is performed.
- *
- * \par Example
- * \include Algorithms/StaticForExample.cpp
- * \par Output
- * \include StaticForExample.out
- */
-template< int Begin, int End, bool unrolled = (End - Begin <= 8) >
-struct StaticFor;
-
-template< int Begin, int End >
-struct StaticFor< Begin, End, true >
-{
-   static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." );
-
-   /**
-    * \brief Static method for execution od the StaticFor.
-    *
-    * \param f is a (lambda) function to be performed in each iteration.
-    * \param args are auxiliary data to be passed to the function f.
-    */
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args )
-   {
-      f( Begin, args... );
-      StaticFor< Begin + 1, End >::exec( f, std::forward< Args >( args )... );
-   }
-};
-
-template< int End >
-struct StaticFor< End, End, true >
-{
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args ) {}
-};
-
-template< int Begin, int End >
-struct StaticFor< Begin, End, false >
-{
-   static_assert( Begin <= End, "Wrong index interval for StaticFor. Begin must be less than or equal to end." );
-
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args )
-   {
-      for( int i = Begin; i < End; i++ )
-         f( i, std::forward< Args >( args )... );
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/StaticVectorFor.h b/src/TNL/Algorithms/StaticVectorFor.h
deleted file mode 100644
index 664f97aed95651249447788d62a6f19be8855bd6..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/StaticVectorFor.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/***************************************************************************
-                          StaticVectorFor.h  -  description
-                             -------------------
-    begin                : July 12, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Containers/StaticVector.h>
-
-namespace TNL {
-namespace Algorithms {
-
-struct StaticVectorFor
-{
-   template< typename Index,
-             typename Function,
-             typename... FunctionArgs,
-             int dim >
-   static void exec( const Containers::StaticVector< dim, Index >& begin,
-                     const Containers::StaticVector< dim, Index >& end,
-                     Function f,
-                     FunctionArgs... args )
-   {
-      static_assert( 1 <= dim && dim <= 3, "unsupported dimension" );
-      Containers::StaticVector< dim, Index > index;
-
-      if( dim == 1 ) {
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-            f( index, args... );
-      }
-
-      if( dim == 2 ) {
-         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-               f( index, args... );
-      }
-
-      if( dim == 3 ) {
-         for( index[2] = begin[2]; index[2] < end[2]; index[2]++ )
-         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-            f( index, args... );
-      }
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/TemplateStaticFor.h b/src/TNL/Algorithms/TemplateStaticFor.h
deleted file mode 100644
index c96c816dc990aad8b3cb1fc8da9b5cc67b8f110a..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/TemplateStaticFor.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************
-                          TemplateStaticFor.h  -  description
-                             -------------------
-    begin                : Feb 23, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <utility>
-#include <type_traits>
-
-#include <TNL/Cuda/CudaCallable.h>
-
-namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief TemplateStaticFor serves for coding for-loops in template parameters.
- *
- * The result of calling this loop with a templated class \p LoopBody is as follows:
- *
- * LoopBody< begin >::exec( ... );
- *
- * LoodBody< begin + 1 >::exec( ... );
- *
- * ...
- *
- * LoopBody< end - 1 >::exec( ... );
- *
- * \tparam IndexType is type of the loop indexes
- * \tparam begin the loop iterates over index interval [begin,end).
- * \tparam end the loop iterates over index interval [begin,end).
- * \tparam LoopBody is a templated class having one template parameter of IndexType.
- *
- * \par Example
- * \include Algorithms/TamplateStaticForExample.cpp
- * \par Output
- * \include TamplateStaticForExample.out
- */
-template< typename IndexType,
-          IndexType begin,
-          IndexType end,
-          template< IndexType > class LoopBody >
-struct TemplateStaticFor;
-
-namespace detail {
-
-template< typename IndexType,
-          typename Begin,
-          typename N,
-          template< IndexType > class LoopBody >
-struct TemplateStaticForExecutor
-{
-   /**
-    * \brief Static method initiating the for-loop.
-    *
-    * \tparam Args type of user defined data to be passed to for-loop.
-    * \param args user defined data to be passed to for-loop.
-    */
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {
-      using Decrement = std::integral_constant< IndexType, N::value - 1 >;
-      TemplateStaticForExecutor< IndexType, Begin, Decrement, LoopBody >::exec( std::forward< Args >( args )... );
-      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
-   }
-
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {
-      using Decrement = std::integral_constant< IndexType, N::value - 1 >;
-      TemplateStaticForExecutor< IndexType, Begin, Decrement, LoopBody >::execHost( std::forward< Args >( args )... );
-      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
-   }
-};
-
-template< typename IndexType,
-          typename Begin,
-          template< IndexType > class LoopBody >
-struct TemplateStaticForExecutor< IndexType,
-                                  Begin,
-                                  std::integral_constant< IndexType, 0 >,
-                                  LoopBody >
-{
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {}
-
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {}
-};
-
-} // namespace detail
-
-template< typename IndexType,
-          IndexType begin,
-          IndexType end,
-          template< IndexType > class LoopBody >
-struct TemplateStaticFor
-{
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {
-      detail::TemplateStaticForExecutor< IndexType,
-                                 std::integral_constant< IndexType, begin >,
-                                 std::integral_constant< IndexType, end - begin >,
-                                 LoopBody >::exec( std::forward< Args >( args )... );
-   }
-
-   // nvcc would complain if we wonted to call a host-only function from the __cuda_callable__ exec above
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {
-      detail::TemplateStaticForExecutor< IndexType,
-                                 std::integral_constant< IndexType, begin >,
-                                 std::integral_constant< IndexType, end - begin >,
-                                 LoopBody >::execHost( std::forward< Args >( args )... );
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/CudaMultireductionKernel.h b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaMultireductionKernel.h
rename to src/TNL/Algorithms/detail/CudaMultireductionKernel.h
index a7979fb7f10a8af83c81e9639d7bab733c485b5d..973d8e958c66b29b49d46419020ee938c2205143 100644
--- a/src/TNL/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
@@ -21,6 +21,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 #ifdef HAVE_CUDA
 /****
@@ -281,5 +282,6 @@ CudaMultireductionKernelLauncher( const Result zero,
 #endif
 }
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaReductionKernel.h
rename to src/TNL/Algorithms/detail/CudaReductionKernel.h
index c3d981fd57505f8b4c7f52757acb3904c2924324..51f38f18bee4b8645d720538d190bee04b59e265 100644
--- a/src/TNL/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -21,6 +21,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 /****
  * The performance of this kernel is very sensitive to register usage.
@@ -642,5 +643,6 @@ struct CudaReductionKernelLauncher
       Index reducedSize;
 };
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaScanKernel.h
rename to src/TNL/Algorithms/detail/CudaScanKernel.h
index 97912b2343907e504db133ae545f5f420f18f0e3..63072ea893da12f6790f364d7d456f2c84b06945 100644
--- a/src/TNL/Algorithms/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -19,6 +19,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 #ifdef HAVE_CUDA
 
@@ -386,5 +387,6 @@ struct CudaScanKernelLauncher
 
 #endif
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/staticFor.h b/src/TNL/Algorithms/staticFor.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f1b9d1e0245145744ee09649ba182697238589d
--- /dev/null
+++ b/src/TNL/Algorithms/staticFor.h
@@ -0,0 +1,113 @@
+/***************************************************************************
+                          staticFor.h  -  description
+                             -------------------
+    begin                : Feb 23, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <utility>
+#include <type_traits>
+
+namespace TNL {
+namespace Algorithms {
+
+namespace detail {
+
+// overload selected for `begin >= end` (i.e. empty loop): `f` is never called
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+static_for_dispatch( Func &&f )
+{}
+
+#if __cplusplus >= 201703L
+
+// C++17 version: comma-operator fold expression calling `f` for `begin + idx...`, in order
+template< typename Index, Index begin,  typename Func, Index... idx >
+constexpr void static_for_impl( Func &&f, std::integer_sequence< Index, idx... > )
+{
+   ( f( std::integral_constant<Index, begin + idx>{} ), ... );
+}
+
+// general dispatch for `begin < end`: generates the offsets 0, ..., end - begin - 1
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin < end) >
+static_for_dispatch( Func &&f )
+{
+   static_for_impl< Index, begin >(
+         std::forward< Func >( f ),
+         std::make_integer_sequence< Index, end - begin >{}
+   );
+}
+
+#else
+
+// C++14 version using recursive folding
+// (We avoid manual folding with std::integer_sequence, because it cannot be
+// empty, so it would be rather weird. Folding is done by bisection to limit
+// the recursion depth.)
+
+// overload for exactly 1 iteration (this is where the bisection below terminates)
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin == 1) >
+static_for_dispatch( Func &&f )
+{
+   f( std::integral_constant< Index, begin >{} );
+}
+
+// overload for at least 2 iterations: processes `[begin, mid)` first, then `[mid, end)`
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin >= 2) >
+static_for_dispatch( Func &&f )
+{
+   constexpr Index mid = begin + (end - begin) / 2;  // begin < mid < end, so both halves are non-empty
+   static_for_dispatch< Index, begin, mid >( std::forward< Func >( f ) );
+   static_for_dispatch< Index, mid, end >( std::forward< Func >( f ) );
+}
+
+#endif
+
+} // namespace detail
+
+/**
+ * \brief Generic loop with constant bounds and indices usable in constant
+ * expressions.
+ *
+ * \e staticFor is a generic C++14/C++17 implementation of a static for-loop
+ * using \e constexpr functions and template metaprogramming. It is equivalent
+ * to executing a function `f(i)` for arguments `i` from the integral range
+ * `[begin, end)`, but with the type \ref std::integral_constant rather than
+ * `int` or `std::size_t` representing the indices. Hence, each index has its
+ * own distinct C++ type and the \e value of the index can be deduced from the
+ * type.
+ *
+ * Also note that thanks to `constexpr`, the argument `i` can be used in
+ * constant expressions and the \e staticFor function can be used from the host
+ * code as well as CUDA kernels (TNL requires the `--expt-relaxed-constexpr`
+ * parameter when compiled by `nvcc`).
+ *
+ * \tparam Index is the type of the loop indices.
+ * \tparam begin is the left bound of the iteration range `[begin, end)`.
+ * \tparam end is the right bound of the iteration range `[begin, end)`.
+ * \tparam Func is the type of the functor (it is usually deduced from the
+ *    argument used in the function call).
+ *
+ * \param f is the functor called as `f( std::integral_constant< Index, i >{} )` in each iteration.
+ *
+ * \par Example
+ * \include Algorithms/staticForExample.cpp
+ * \par Output
+ * \include staticForExample.out
+ */
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr void staticFor( Func&& f )
+{
+   detail::static_for_dispatch< Index, begin, end >( std::forward< Func >( f ) );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/unrolledFor.h b/src/TNL/Algorithms/unrolledFor.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e16e402c13367e8351c1111b5980ad2f09976e7
--- /dev/null
+++ b/src/TNL/Algorithms/unrolledFor.h
@@ -0,0 +1,91 @@
+/***************************************************************************
+                          unrolledFor.h  -  description
+                             -------------------
+    begin                : Jul 16, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <utility>
+#include <type_traits>
+
+namespace TNL {
+namespace Algorithms {
+
+namespace detail {
+
+// overload for an empty loop (`begin >= end`): `f` is never called
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+unrolled_for_dispatch( Func&& f )
+{}
+
+// overload for exactly 1 iteration (selected regardless of unrollFactor)
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin == 1) >
+unrolled_for_dispatch( Func&& f )
+{
+   f( begin );
+}
+
+// overload for unrolling short loops (at least 2, but at most unrollFactor iterations) by bisection
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin >= 2 && end - begin <= unrollFactor) >
+unrolled_for_dispatch( Func&& f )
+{
+   constexpr Index mid = begin + (end - begin) / 2;  // begin < mid < end, so both halves are non-empty
+   unrolled_for_dispatch< Index, begin, mid, unrollFactor >( std::forward< Func >( f ) );
+   unrolled_for_dispatch< Index, mid, end, unrollFactor >( std::forward< Func >( f ) );
+}
+
+// overload for long loops - normal for-loop (`end - begin > 1` keeps it disjoint from the 1-iteration overload when unrollFactor < 1)
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin > 1 && end - begin > unrollFactor) >
+unrolled_for_dispatch( Func&& f )
+{
+   for( Index i = begin; i < end; i++ )
+      f( i );
+}
+
+} // namespace detail
+
+/**
+ * \brief Generic for-loop with explicit unrolling.
+ *
+ * \e unrolledFor performs explicit loop unrolling of short loops which can
+ * improve performance in some cases. The bounds of the for-loop must be constant
+ * (i.e. known at compile time). Loops longer than \e unrollFactor are not
+ * unrolled and are executed as a normal for-loop.
+ *
+ * The unroll factor is configurable, but note that full unrolling does not
+ * make sense for very long loops. It might even trigger the compiler's limit
+ * on recursive template instantiation. Also note that the compiler will (at
+ * least partially) unroll loops with static bounds anyway.
+ *
+ * \tparam Index is the type of the loop indices.
+ * \tparam begin is the left bound of the iteration range `[begin, end)`.
+ * \tparam end is the right bound of the iteration range `[begin, end)`.
+ * \tparam unrollFactor is the maximum length of loops to fully unroll via
+ *    recursive template instantiation (the default is 8).
+ * \tparam Func is the type of the functor (it is usually deduced from the
+ *    argument used in the function call).
+ *
+ * \param f is the functor called as `f( i )` with `i` of type \e Index in each iteration.
+ *
+ * \par Example
+ * \include Algorithms/unrolledForExample.cpp
+ * \par Output
+ * \include unrolledForExample.out
+ */
+template< typename Index, Index begin, Index end, Index unrollFactor = 8,  typename Func >
+constexpr void unrolledFor( Func&& f )
+{
+   detail::unrolled_for_dispatch< Index, begin, end, unrollFactor >( std::forward< Func >( f ) );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 53c9290cabefa0251a30a489fc28b1c465b1f7e1..77c85c750c08779d3ac65abd538ee8094553e2f5 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -136,7 +136,7 @@ class Array
        * \param size The number of array elements to be allocated.
        * \param allocator The allocator to be associated with this array.
        */
-      explicit Array( const IndexType& size, const AllocatorType& allocator = AllocatorType() );
+      explicit Array( IndexType size, const AllocatorType& allocator = AllocatorType() );
 
       /**
        * \brief Constructs an array with given size and value.
@@ -145,7 +145,7 @@ class Array
        * \param value The value all elements will be set to.
        * \param allocator The allocator to be associated with this array.
        */
-      explicit Array( const IndexType& size, const Value& value, const AllocatorType& allocator = AllocatorType() );
+      explicit Array( IndexType size, ValueType value, const AllocatorType& allocator = AllocatorType() );
 
       /**
        * \brief Constructs an array with given size and copies data from given
@@ -155,8 +155,8 @@ class Array
        * \param size The number of array elements to be copied to the array.
        * \param allocator The allocator to be associated with this array.
        */
-      Array( Value* data,
-             const IndexType& size,
+      Array( ValueType* data,
+             IndexType size,
              const AllocatorType& allocator = AllocatorType() );
 
       /**
@@ -287,7 +287,7 @@ class Array
        *
        * \param size The new size of the array.
        */
-      void resize( Index size );
+      void resize( IndexType size );
 
       /**
        * \brief Method for resizing the array with an initial value.
@@ -306,7 +306,7 @@ class Array
        * \param size The new size of the array.
        * \param value The value to initialize new elements with.
        */
-      void resize( Index size, const ValueType& value );
+      void resize( IndexType size, ValueType value );
 
       /**
        * \brief Method for setting the array size.
@@ -318,14 +318,14 @@ class Array
        *
        * \param size The new size of the array.
        */
-      void setSize( Index size );
+      void setSize( IndexType size );
 
       /**
        * \brief Returns the current array size.
        *
        * This method can be called from device kernels.
        */
-      __cuda_callable__ Index getSize() const;
+      __cuda_callable__ IndexType getSize() const;
 
       /**
        * \brief Sets the same size as the size of an existing array.
@@ -448,10 +448,10 @@ class Array
        * where the array is allocated.
        *
        * \param i The index of the element to be set.
-       * \param v The new value of the element.
+       * \param value The new value of the element.
        */
       __cuda_callable__
-      void setElement( const Index& i, const Value& v );
+      void setElement( IndexType i, ValueType value );
 
       /**
        * \brief Returns the value of the \e i-th element.
@@ -462,7 +462,7 @@ class Array
        * \param i The index of the element to be returned.
        */
       __cuda_callable__
-      Value getElement( const Index& i ) const;
+      ValueType getElement( IndexType i ) const;
 
       /**
        * \brief Accesses the \e i-th element of the array.
@@ -479,7 +479,7 @@ class Array
        * \param i The index of the element to be accessed.
        * \return Reference to the \e i-th element.
        */
-      __cuda_callable__ Value& operator[]( const Index& i );
+      __cuda_callable__ Value& operator[]( IndexType i );
 
       /**
        * \brief Accesses the \e i-th element of the array.
@@ -496,7 +496,7 @@ class Array
        * \param i The index of the element to be accessed.
        * \return Constant reference to the \e i-th element.
        */
-      __cuda_callable__ const Value& operator[]( const Index& i ) const;
+      __cuda_callable__ const Value& operator[]( IndexType i ) const;
 
       /**
        * \brief Copy-assignment operator for copying data from another array.
@@ -557,7 +557,7 @@ class Array
        *         container, e.g. \ref Array, \ref ArrayView, \ref Vector,
        *         \ref VectorView, etc.
        * \param array Reference to the array-like container.
-       * \return \ref True if both arrays are element-wise equal and \ref false
+       * \return `true` if both arrays are element-wise equal and `false`
        *         otherwise.
        */
       template< typename ArrayT >
@@ -582,13 +582,13 @@ class Array
        * or \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are set.
        *
-       * \param v The new value for the array elements.
+       * \param value The new value for the array elements.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
        */
-      void setValue( const ValueType& v,
+      void setValue( ValueType value,
                      IndexType begin = 0,
                      IndexType end = 0 );
 
@@ -603,8 +603,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -633,8 +633,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -663,8 +663,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -691,8 +691,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -727,7 +727,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -744,7 +744,7 @@ class Array
       template< typename Fetch,
                 typename Reduce,
                 typename Result >
-      Result reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
+      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
 
        /**
         * \brief Computes reduction with array elements on interval [ \e begin, \e end) for constant instances.
@@ -765,7 +765,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -782,7 +782,7 @@ class Array
       template< typename Fetch,
                 typename Reduce,
                 typename Result >
-      Result reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
+      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
 
        /**
         * \brief Computes reduction with all array elements.
@@ -801,7 +801,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -837,7 +837,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -863,15 +863,15 @@ class Array
        * \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are checked.
        *
-       * \param v The value to be checked.
+       * \param value The value to be checked.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
-       * \return True if there is _at least one_ element in the sub-interval
-       *         `[begin, end)` which has the value \e v.
+       * \return `true` if there is _at least one_ element in the sub-interval
+       *         `[begin, end)` which has the value \e value.
        */
-      bool containsValue( const ValueType& v,
+      bool containsValue( ValueType value,
                           IndexType begin = 0,
                           IndexType end = 0 ) const;
 
@@ -882,15 +882,15 @@ class Array
        * \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are checked.
        *
-       * \param v The value to be checked.
+       * \param value The value to be checked.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
-       * \return True if there is _all_ elements in the sub-interval
-       *         `[begin, end)` have the same value \e v.
+       * \return `true` if _all_ elements in the sub-interval `[begin, end)`
+       *         have the same value \e value.
        */
-      bool containsOnlyValue( const ValueType& v,
+      bool containsOnlyValue( ValueType value,
                               IndexType begin = 0,
                               IndexType end = 0 ) const;
 
@@ -919,13 +919,13 @@ class Array
       /** \brief Internal method for reallocating array elements. Used only
        * from the two overloads of \ref resize.
        */
-      void reallocate( Index size );
+      void reallocate( IndexType size );
 
       /** \brief Pointer to the data. */
       Value* data = nullptr;
 
       /** \brief Number of elements in the array. */
-      Index size = 0;
+      IndexType size = 0;
 
       /**
        * \brief The internal allocator instance.
@@ -941,7 +941,7 @@ class Array
  * \tparam Index is a type used for the indexing of the array elements.
  *
  * \param str is a output stream.
- * \param view is the array to be printed.
+ * \param array is the array to be printed.
  *
  * \return a reference on the output stream \ref std::ostream&.
  */
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index f313a4cf522ebc430e13e810fd354b08a1f0acf1..d310534a30c7d9562d09252341e089121aa40323 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -52,7 +52,7 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( const IndexType& size, const AllocatorType& allocator )
+Array( IndexType size, const AllocatorType& allocator )
 : allocator( allocator )
 {
    this->setSize( size );
@@ -63,7 +63,7 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( const IndexType& size, const Value& value, const AllocatorType& allocator )
+Array( IndexType size, ValueType value, const AllocatorType& allocator )
 : allocator( allocator )
 {
    this->setSize( size );
@@ -75,8 +75,8 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( Value* data,
-       const IndexType& size,
+Array( ValueType* data,
+       IndexType size,
        const AllocatorType& allocator )
 : allocator( allocator )
 {
@@ -244,7 +244,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-reallocate( Index size )
+reallocate( IndexType size )
 {
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 
@@ -288,10 +288,10 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-resize( Index size )
+resize( IndexType size )
 {
    // remember the old size and reallocate the array
-   const Index old_size = this->size;
+   const IndexType old_size = this->size;
    reallocate( size );
 
    if( old_size < size )
@@ -306,10 +306,10 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-resize( Index size, const ValueType& value )
+resize( IndexType size, ValueType value )
 {
    // remember the old size and reallocate the array
-   const Index old_size = this->size;
+   const IndexType old_size = this->size;
    reallocate( size );
 
    if( old_size < size )
@@ -323,7 +323,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-setSize( Index size )
+setSize( IndexType size )
 {
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 
@@ -495,7 +495,7 @@ template< typename Value,
           typename Allocator >
 __cuda_callable__ void
 Array< Value, Device, Index, Allocator >::
-setElement( const Index& i, const Value& x )
+setElement( IndexType i, ValueType x )
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -508,7 +508,7 @@ template< typename Value,
           typename Allocator >
 __cuda_callable__ Value
 Array< Value, Device, Index, Allocator >::
-getElement( const Index& i ) const
+getElement( IndexType i ) const
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -522,7 +522,7 @@ template< typename Value,
 __cuda_callable__
 Value&
 Array< Value, Device, Index, Allocator >::
-operator[]( const Index& i )
+operator[]( IndexType i )
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -541,7 +541,7 @@ template< typename Value,
 __cuda_callable__
 const Value&
 Array< Value, Device, Index, Allocator >::
-operator[]( const Index& i ) const
+operator[]( IndexType i ) const
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -647,9 +647,7 @@ operator==( const ArrayT& array ) const
    if( this->getSize() == 0 )
       return true;
    return Algorithms::MultiDeviceMemoryOperations< Device, typename ArrayT::DeviceType >::
-            compare( this->getData(),
-                           array.getData(),
-                           array.getSize() );
+            compare( this->getData(), array.getData(), array.getSize() );
 }
 
 template< typename Value,
@@ -670,7 +668,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-setValue( const ValueType& v,
+setValue( ValueType v,
           IndexType begin,
           IndexType end )
 {
@@ -742,7 +740,7 @@ template< typename Value,
          typename Result >
 Result
 Array< Value, Device, Index, Allocator >::
-reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    return this->getView().reduceElements( begin, end, fetch, reduce, zero );
 }
@@ -756,7 +754,7 @@ template< typename Value,
          typename Result >
 Result
 Array< Value, Device, Index, Allocator >::
-reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    return this->getConstView().reduceElements( begin, end, fetch, reduce, zero );
 }
@@ -795,7 +793,7 @@ template< typename Value,
           typename Allocator >
 bool
 Array< Value, Device, Index, Allocator >::
-containsValue( const ValueType& v,
+containsValue( ValueType value,
                IndexType begin,
                IndexType end ) const
 {
@@ -803,7 +801,7 @@ containsValue( const ValueType& v,
    if( end == 0 )
       end = this->getSize();
 
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, v );
+   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
 }
 
 template< typename Value,
@@ -812,7 +810,7 @@ template< typename Value,
           typename Allocator >
 bool
 Array< Value, Device, Index, Allocator >::
-containsOnlyValue( const ValueType& v,
+containsOnlyValue( ValueType value,
                    IndexType begin,
                    IndexType end ) const
 {
@@ -820,7 +818,7 @@ containsOnlyValue( const ValueType& v,
    if( end == 0 )
       end = this->getSize();
 
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, v );
+   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 4b38460375abe28012bd76e29fa929580a13e5bf..eb7e548b074ef26225de8382fa8345107e87e0b2 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -119,7 +119,7 @@ public:
     * \param size The number of elements in the array view.
     */
    __cuda_callable__
-   ArrayView( Value* data, Index size );
+   ArrayView( ValueType* data, IndexType size );
 
    /**
     * \brief Shallow copy constructor.
@@ -165,7 +165,7 @@ public:
     * \param size The number of elements in the array view.
     */
    __cuda_callable__
-   void bind( Value* data, const Index size );
+   void bind( ValueType* data, IndexType size );
 
    /**
     * \brief Method for rebinding (reinitialization) using another array view.
@@ -193,7 +193,7 @@ public:
     *            which is, however, replaced with the array size.
     */
    __cuda_callable__
-   ViewType getView( const IndexType begin = 0, IndexType end = 0 );
+   ViewType getView( IndexType begin = 0, IndexType end = 0 );
 
    /**
     * \brief Returns a non-modifiable view of the array view.
@@ -208,7 +208,7 @@ public:
     *            which is, however, replaced with the array size.
     */
    __cuda_callable__
-   ConstViewType getConstView( const IndexType begin = 0, IndexType end = 0 ) const;
+   ConstViewType getConstView( IndexType begin = 0, IndexType end = 0 ) const;
 
    /**
     * \brief Deep copy assignment operator for copying data from another array
@@ -270,7 +270,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   const Value* getData() const;
+   const ValueType* getData() const;
 
    /**
     * \brief Returns a raw pointer to the data.
@@ -278,7 +278,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Value* getData();
+   ValueType* getData();
 
    /**
     * \brief Returns a \e const-qualified raw pointer to the data.
@@ -289,7 +289,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   const Value* getArrayData() const;
+   const ValueType* getArrayData() const;
 
    /**
     * \brief Returns a raw pointer to the data.
@@ -300,7 +300,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Value* getArrayData();
+   ValueType* getArrayData();
 
    /**
     * \brief Returns the current size of the array view.
@@ -308,7 +308,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Index getSize() const;
+   IndexType getSize() const;
 
    /**
     * \brief Sets the value of the \e i-th element to \e v.
@@ -317,10 +317,10 @@ public:
     * where the array is allocated.
     *
     * \param i The index of the element to be set.
-    * \param v The new value of the element.
+    * \param value The new value of the element.
     */
    __cuda_callable__
-   void setElement( Index i, Value value );
+   void setElement( IndexType i, ValueType value );
 
    /**
     * \brief Returns the value of the \e i-th element.
@@ -331,7 +331,7 @@ public:
     * \param i The index of the element to be returned.
     */
    __cuda_callable__
-   Value getElement( Index i ) const;
+   ValueType getElement( IndexType i ) const;
 
    /**
     * \brief Accesses the \e i-th element of the array view.
@@ -349,7 +349,7 @@ public:
     * \return Reference to the \e i-th element.
     */
    __cuda_callable__
-   Value& operator[]( Index i );
+   Value& operator[]( IndexType i );
 
    /**
     * \brief Accesses the \e i-th element of the array view.
@@ -367,7 +367,7 @@ public:
     * \return Constant reference to the \e i-th element.
     */
    __cuda_callable__
-   const Value& operator[]( Index i ) const;
+   const Value& operator[]( IndexType i ) const;
 
    /**
     * \brief Compares the array view with another array-like container.
@@ -401,15 +401,15 @@ public:
     * \e begin or \e end is set to a non-zero value, only elements in the
     * sub-interval `[begin, end)` are set.
     *
-    * \param v The new value for the array view elements.
+    * \param value The new value for the array view elements.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
     */
-   void setValue( Value value,
-                  const Index begin = 0,
-                  Index end = 0 );
+   void setValue( ValueType value,
+                  IndexType begin = 0,
+                  IndexType end = 0 );
 
    /**
     * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
@@ -422,8 +422,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -452,8 +452,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place
     * where the array is allocated, i.e. it is efficient even on GPU.
@@ -481,8 +481,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -509,8 +509,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -545,7 +545,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -562,7 +562,7 @@ public:
    template< typename Fetch,
              typename Reduce,
              typename Result >
-   Result reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
+   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
 
    /**
     * \brief Computes reduction with array view elements on interval [ \e begin, \e end) for constant instances.
@@ -583,7 +583,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -600,7 +600,7 @@ public:
    template< typename Fetch,
              typename Reduce,
              typename Result >
-   Result reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
+   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
 
    /**
     * \brief Computes reduction with all array view elements.
@@ -619,7 +619,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -655,7 +655,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -681,17 +681,17 @@ public:
     * \e end is set to a non-zero value, only elements in the sub-interval
     * `[begin, end)` are checked.
     *
-    * \param v The value to be checked.
+    * \param value The value to be checked.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
-    * \return True if there is _at least one_ element in the sub-interval
-    *         `[begin, end)` which has the value \e v.
+    * \return `true` if there is _at least one_ element in the sub-interval
+    *         `[begin, end)` which has the value \e value.
     */
-   bool containsValue( Value value,
-                       const Index begin = 0,
-                       Index end = 0  ) const;
+   bool containsValue( ValueType value,
+                       IndexType begin = 0,
+                       IndexType end = 0 ) const;
 
    /**
     * \brief Checks if all elements have the same value \e v.
@@ -700,17 +700,17 @@ public:
     * \e end is set to a non-zero value, only elements in the sub-interval
     * `[begin, end)` are checked.
     *
-    * \param v The value to be checked.
+    * \param value The value to be checked.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
-    * \return True if there is _all_ elements in the sub-interval
-    *         `[begin, end)` have the same value \e v.
+    * \return `true` if _all_ elements in the sub-interval `[begin, end)`
+    *         have the same value \e value.
     */
-   bool containsOnlyValue( Value value,
-                           const Index begin = 0,
-                           Index end = 0  ) const;
+   bool containsOnlyValue( ValueType value,
+                           IndexType begin = 0,
+                           IndexType end = 0 ) const;
 
    /**
     * \brief Method for saving the data to a binary file \e fileName.
@@ -728,10 +728,10 @@ public:
 
 protected:
    //! Pointer to the data
-   Value* data = nullptr;
+   ValueType* data = nullptr;
 
    //! Array view size
-   Index size = 0;
+   IndexType size = 0;
 };
 
 /**
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index eeb0b1b4b6a2cfe4bc198f349b579541f60357ac..a2174242fb82b5771709a41398bb72a80d9884a9 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -32,7 +32,8 @@ template< typename Value,
           typename Index >
 __cuda_callable__
 ArrayView< Value, Device, Index >::
-ArrayView( Value* data, Index size ) : data(data), size(size)
+ArrayView( ValueType* data, IndexType size )
+: data(data), size(size)
 {
    TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
@@ -46,7 +47,7 @@ template< typename Value,
 __cuda_callable__
 void
 ArrayView< Value, Device, Index >::
-bind( Value* data, Index size )
+bind( ValueType* data, IndexType size )
 {
    TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
@@ -60,7 +61,9 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-void ArrayView< Value, Device, Index >::bind( ArrayView view )
+void
+ArrayView< Value, Device, Index >::
+bind( ArrayView view )
 {
    bind( view.getData(), view.getSize() );
 }
@@ -71,7 +74,7 @@ template< typename Value,
 __cuda_callable__
 typename ArrayView< Value, Device, Index >::ViewType
 ArrayView< Value, Device, Index >::
-getView( const IndexType begin, IndexType end )
+getView( IndexType begin, IndexType end )
 {
    if( end == 0 )
       end = this->getSize();
@@ -84,7 +87,7 @@ template< typename Value,
 __cuda_callable__
 typename ArrayView< Value, Device, Index >::ConstViewType
 ArrayView< Value, Device, Index >::
-getConstView( const IndexType begin, IndexType end ) const
+getConstView( IndexType begin, IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
@@ -157,8 +160,8 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value* ArrayView< Value, Device, Index >::
+const Value*
+ArrayView< Value, Device, Index >::
 getData() const
 {
    return data;
@@ -179,8 +182,8 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value* ArrayView< Value, Device, Index >::
+const Value*
+ArrayView< Value, Device, Index >::
 getArrayData() const
 {
    return data;
@@ -214,7 +217,7 @@ template< typename Value,
 __cuda_callable__
 void
 ArrayView< Value, Device, Index >::
-setElement( Index i, Value value )
+setElement( IndexType i, ValueType value )
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -226,7 +229,7 @@ template< typename Value,
           typename Index >
 __cuda_callable__ Value
 ArrayView< Value, Device, Index >::
-getElement( Index i ) const
+getElement( IndexType i ) const
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -238,7 +241,7 @@ template< typename Value,
           typename Index >
 __cuda_callable__
 Value& ArrayView< Value, Device, Index >::
-operator[]( Index i )
+operator[]( IndexType i )
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -254,9 +257,9 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value& ArrayView< Value, Device, Index >::
-operator[]( Index i ) const
+const Value&
+ArrayView< Value, Device, Index >::
+operator[]( IndexType i ) const
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -302,7 +305,7 @@ template< typename Value,
           typename Index >
 void
 ArrayView< Value, Device, Index >::
-setValue( Value value, const Index begin, Index end )
+setValue( ValueType value, IndexType begin, IndexType end )
 {
    TNL_ASSERT_GT( size, 0, "Attempted to set value to an empty array view." );
    if( end == 0 )
@@ -314,14 +317,15 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
-forElements( const Index begin, Index end, Function&& f )
+void
+ArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f )
 {
    if( ! this->data )
       return;
 
    ValueType* d = this->getData();
-   auto g = [=] __cuda_callable__ ( Index i ) mutable
+   auto g = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       f( i, d[ i ] );
    };
@@ -332,14 +336,15 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
-forElements( const Index begin, Index end, Function&& f ) const
+void
+ArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
 {
    if( ! this->data )
       return;
 
    const ValueType* d = this->getData();
-   auto g = [=] __cuda_callable__ ( Index i )
+   auto g = [=] __cuda_callable__ ( IndexType i )
    {
       f( i, d[ i ] );
    };
@@ -350,7 +355,8 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
+void
+ArrayView< Value, Device, Index >::
 forAllElements( Function&& f )
 {
    this->forElements( 0, this->getSize(), f );
@@ -360,7 +366,8 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
+void
+ArrayView< Value, Device, Index >::
 forAllElements( Function&& f ) const
 {
    this->forElements( 0, this->getSize(), f );
@@ -372,8 +379,9 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
-reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
+Result
+ArrayView< Value, Device, Index >::
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    if( ! this->data )
       return zero;
@@ -389,8 +397,9 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
-reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
+Result
+ArrayView< Value, Device, Index >::
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    if( ! this->data )
-      return;
+      return zero;  // unbound view: yield `zero`, same as the non-const overload
@@ -406,7 +415,8 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
+Result
+ArrayView< Value, Device, Index >::
 reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
@@ -418,7 +428,8 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
+Result
+ArrayView< Value, Device, Index >::
 reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
@@ -429,9 +440,9 @@ template< typename Value,
           typename Index >
 bool
 ArrayView< Value, Device, Index >::
-containsValue( Value value,
-               const Index begin,
-               Index end ) const
+containsValue( ValueType value,
+               IndexType begin,
+               IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
@@ -443,33 +454,21 @@ template< typename Value,
           typename Index >
 bool
 ArrayView< Value, Device, Index >::
-containsOnlyValue( Value value,
-                   const Index begin,
-                   Index end  ) const
+containsOnlyValue( ValueType value,
+                   IndexType begin,
+                   IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
    return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
 }
 
-template< typename Value, typename Device, typename Index >
-std::ostream& operator<<( std::ostream& str, const ArrayView< Value, Device, Index >& view )
-{
-   str << "[ ";
-   if( view.getSize() > 0 )
-   {
-      str << view.getElement( 0 );
-      for( Index i = 1; i < view.getSize(); i++ )
-         str << ", " << view.getElement( i );
-   }
-   str << " ]";
-   return str;
-}
-
 template< typename Value,
           typename Device,
           typename Index >
-void ArrayView< Value, Device, Index >::save( const String& fileName ) const
+void
+ArrayView< Value, Device, Index >::
+save( const String& fileName ) const
 {
    File( fileName, std::ios_base::out ) << *this;
 }
@@ -484,6 +483,20 @@ load( const String& fileName )
    File( fileName, std::ios_base::in ) >> *this;
 }
 
+template< typename Value, typename Device, typename Index >
+std::ostream& operator<<( std::ostream& str, const ArrayView< Value, Device, Index >& view )
+{
+   str << "[ ";
+   if( view.getSize() > 0 )
+   {
+      str << view.getElement( 0 );
+      for( Index i = 1; i < view.getSize(); i++ )
+         str << ", " << view.getElement( i );
+   }
+   str << " ]";
+   return str;
+}
+
 // Serialization of array views into binary files.
 template< typename Value, typename Device, typename Index >
 File& operator<<( File& file, const ArrayView< Value, Device, Index > view )
diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h
index 48de78e78dd74551e79c4279c9f73269493e33da..22e67a36e7a9c605d8290d2d4813f0712b06297e 100644
--- a/src/TNL/Containers/DistributedNDArray.h
+++ b/src/TNL/Containers/DistributedNDArray.h
@@ -391,7 +391,22 @@ public:
    void allocate()
    {
       SizesHolderType localSizes;
-      Algorithms::TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds );
+      Algorithms::staticFor< std::size_t, 0, SizesHolderType::getDimension() >(
+         [&] ( auto level ) {
+            if( SizesHolderType::template getStaticSize< level >() != 0 )
+               return;
+
+            const auto begin = localBegins.template getSize< level >();
+            const auto end = localEnds.template getSize< level >();
+            if( begin == end )
+               localSizes.template setSize< level >( globalSizes.template getSize< level >() );
+            else {
+               TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get<level>( OverlapsType{} ), "local size is less than the size of overlaps" );
+               //localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get<level>( OverlapsType{} ) );
+               localSizes.template setSize< level >( end - begin );
+            }
+         }
+      );
       localArray.setSize( localSizes );
    }
 
@@ -439,28 +454,6 @@ protected:
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
    SizesHolderType localEnds;
-
-private:
-   template< std::size_t level >
-   struct LocalSizesSetter
-   {
-      template< typename SizesHolder, typename LocalBegins >
-      static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds )
-      {
-         if( SizesHolder::template getStaticSize< level >() != 0 )
-            return;
-
-         const auto begin = localBegins.template getSize< level >();
-         const auto end = localEnds.template getSize< level >();
-         if( begin == end )
-            localSizes.template setSize< level >( globalSizes.template getSize< level >() );
-         else {
-            TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get<level>( OverlapsType{} ), "local size is less than the size of overlaps" );
-            //localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get<level>( OverlapsType{} ) );
-            localSizes.template setSize< level >( end - begin );
-         }
-      }
-   };
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h
index fcdb728cff5590368f76a7ccefbfaf83dd5628cf..73cdda7a0492675c9b3afe0cfc3625ed6cab8ddd 100644
--- a/src/TNL/Containers/DistributedNDArraySynchronizer.h
+++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h
@@ -156,7 +156,11 @@ public:
          this->mask = mask;
 
          // allocate buffers
-         Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view );
+         Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+            [&] ( auto dim ) {
+               allocateHelper< dim >( buffers, array_view );
+            }
+         );
       }
       else {
          // only bind to the actual data
@@ -239,12 +243,20 @@ protected:
    RequestsVector worker_init()
    {
       // fill send buffers
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            copyHelper< dim >( buffers, array_view, true, mask );
+         }
+      );
 
       // issue all send and receive async operations
       RequestsVector requests;
       const MPI_Comm group = array_view.getCommunicationGroup();
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group, tag_offset, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            sendHelper< dim >( buffers, requests, group, tag_offset, mask );
+         }
+      );
 
       return requests;
    }
@@ -252,170 +264,164 @@ protected:
    void worker_finish()
    {
       // copy data from receive buffers
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            copyHelper< dim >( buffers, array_view, false, mask );
+         }
+      );
    }
 
    template< std::size_t dim >
-   struct AllocateHelper
+   static void allocateHelper( Buffers& buffers, const DistributedNDArrayView& array_view )
    {
-      static void exec( Buffers& buffers, const DistributedNDArrayView& array_view )
-      {
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 ) {
-            dim_buffers.reset();
-            return;
-         }
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 ) {
+         dim_buffers.reset();
+         return;
+      }
 
-         using LocalBegins = typename DistributedNDArray::LocalBeginsType;
-         using SizesHolder = typename DistributedNDArray::SizesHolderType;
-         const LocalBegins& localBegins = array_view.getLocalBegins();
-         const SizesHolder& localEnds = array_view.getLocalEnds();
+      using LocalBegins = typename DistributedNDArray::LocalBeginsType;
+      using SizesHolder = typename DistributedNDArray::SizesHolderType;
+      const LocalBegins& localBegins = array_view.getLocalBegins();
+      const SizesHolder& localEnds = array_view.getLocalEnds();
 
-         SizesHolder bufferSize( localEnds );
-         bufferSize.template setSize< dim >( overlap );
+      SizesHolder bufferSize( localEnds );
+      bufferSize.template setSize< dim >( overlap );
 
-         // allocate buffers
-         dim_buffers.left_send_buffer.setSize( bufferSize );
-         dim_buffers.left_recv_buffer.setSize( bufferSize );
-         dim_buffers.right_send_buffer.setSize( bufferSize );
-         dim_buffers.right_recv_buffer.setSize( bufferSize );
-
-         // bind views to the buffers
-         dim_buffers.left_send_view.bind( dim_buffers.left_send_buffer.getView() );
-         dim_buffers.left_recv_view.bind( dim_buffers.left_recv_buffer.getView() );
-         dim_buffers.right_send_view.bind( dim_buffers.right_send_buffer.getView() );
-         dim_buffers.right_recv_view.bind( dim_buffers.right_recv_buffer.getView() );
-
-         // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?)
-
-         // offsets for left-send
-         dim_buffers.left_send_offsets = localBegins;
-
-         // offsets for left-receive
-         dim_buffers.left_recv_offsets = localBegins;
-         dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap );
-
-         // offsets for right-send
-         dim_buffers.right_send_offsets = localBegins;
-         dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap );
-
-         // offsets for right-receive
-         dim_buffers.right_recv_offsets = localBegins;
-         dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
-
-         // FIXME: set proper neighbor IDs !!!
-         const MPI_Comm group = array_view.getCommunicationGroup();
-         const int rank = MPI::GetRank(group);
-         const int nproc = MPI::GetSize(group);
-         dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
-         dim_buffers.right_neighbor = (rank + 1) % nproc;
-      }
-   };
+      // allocate buffers
+      dim_buffers.left_send_buffer.setSize( bufferSize );
+      dim_buffers.left_recv_buffer.setSize( bufferSize );
+      dim_buffers.right_send_buffer.setSize( bufferSize );
+      dim_buffers.right_recv_buffer.setSize( bufferSize );
+
+      // bind views to the buffers
+      dim_buffers.left_send_view.bind( dim_buffers.left_send_buffer.getView() );
+      dim_buffers.left_recv_view.bind( dim_buffers.left_recv_buffer.getView() );
+      dim_buffers.right_send_view.bind( dim_buffers.right_send_buffer.getView() );
+      dim_buffers.right_recv_view.bind( dim_buffers.right_recv_buffer.getView() );
+
+      // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?)
+
+      // offsets for left-send
+      dim_buffers.left_send_offsets = localBegins;
+
+      // offsets for left-receive
+      dim_buffers.left_recv_offsets = localBegins;
+      dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap );
+
+      // offsets for right-send
+      dim_buffers.right_send_offsets = localBegins;
+      dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap );
+
+      // offsets for right-receive
+      dim_buffers.right_recv_offsets = localBegins;
+      dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
+
+      // FIXME: set proper neighbor IDs !!!
+      const MPI_Comm group = array_view.getCommunicationGroup();
+      const int rank = MPI::GetRank(group);
+      const int nproc = MPI::GetSize(group);
+      dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
+      dim_buffers.right_neighbor = (rank + 1) % nproc;
+   }
 
    template< std::size_t dim >
-   struct CopyHelper
+   static void copyHelper( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer, SyncDirection mask )
    {
-      static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer, SyncDirection mask )
-      {
-         // skip if there are no overlaps
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 )
-            return;
-
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
-
-         if( buffered ) {
-            // TODO: specify CUDA stream for the copy, otherwise async won't work !!!
-            CopyKernel< decltype(dim_buffers.left_send_view) > copy_kernel;
-            copy_kernel.array_view.bind( array_view );
-            copy_kernel.to_buffer = to_buffer;
-
-            if( to_buffer ) {
-               if( mask & SyncDirection::Left ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.left_send_view );
-                  copy_kernel.array_offsets = dim_buffers.left_send_offsets;
-                  dim_buffers.left_send_view.forAll( copy_kernel );
-               }
-
-               if( mask & SyncDirection::Right ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.right_send_view );
-                  copy_kernel.array_offsets = dim_buffers.right_send_offsets;
-                  dim_buffers.right_send_view.forAll( copy_kernel );
-               }
+      // skip if there are no overlaps
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 )
+         return;
+
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
+
+      if( buffered ) {
+         // TODO: specify CUDA stream for the copy, otherwise async won't work !!!
+         CopyKernel< decltype(dim_buffers.left_send_view) > copy_kernel;
+         copy_kernel.array_view.bind( array_view );
+         copy_kernel.to_buffer = to_buffer;
+
+         if( to_buffer ) {
+            if( mask & SyncDirection::Left ) {
+               copy_kernel.buffer_view.bind( dim_buffers.left_send_view );
+               copy_kernel.array_offsets = dim_buffers.left_send_offsets;
+               dim_buffers.left_send_view.forAll( copy_kernel );
             }
-            else {
-               if( mask & SyncDirection::Right ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.left_recv_view );
-                  copy_kernel.array_offsets = dim_buffers.left_recv_offsets;
-                  dim_buffers.left_recv_view.forAll( copy_kernel );
-               }
-
-               if( mask & SyncDirection::Left ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.right_recv_view );
-                  copy_kernel.array_offsets = dim_buffers.right_recv_offsets;
-                  dim_buffers.right_recv_view.forAll( copy_kernel );
-               }
+
+            if( mask & SyncDirection::Right ) {
+               copy_kernel.buffer_view.bind( dim_buffers.right_send_view );
+               copy_kernel.array_offsets = dim_buffers.right_send_offsets;
+               dim_buffers.right_send_view.forAll( copy_kernel );
             }
          }
          else {
-            // avoid buffering - bind buffer views directly to the array
-            dim_buffers.left_send_view.bind( &call_with_offsets( dim_buffers.left_send_offsets, array_view ) );
-            dim_buffers.left_recv_view.bind( &call_with_offsets( dim_buffers.left_recv_offsets, array_view ) );
-            dim_buffers.right_send_view.bind( &call_with_offsets( dim_buffers.right_send_offsets, array_view ) );
-            dim_buffers.right_recv_view.bind( &call_with_offsets( dim_buffers.right_recv_offsets, array_view ) );
-         }
+            if( mask & SyncDirection::Right ) {
+               copy_kernel.buffer_view.bind( dim_buffers.left_recv_view );
+               copy_kernel.array_offsets = dim_buffers.left_recv_offsets;
+               dim_buffers.left_recv_view.forAll( copy_kernel );
+            }
 
+            if( mask & SyncDirection::Left ) {
+               copy_kernel.buffer_view.bind( dim_buffers.right_recv_view );
+               copy_kernel.array_offsets = dim_buffers.right_recv_offsets;
+               dim_buffers.right_recv_view.forAll( copy_kernel );
+            }
+         }
       }
-   };
+      else {
+         // avoid buffering - bind buffer views directly to the array
+         dim_buffers.left_send_view.bind( &call_with_offsets( dim_buffers.left_send_offsets, array_view ) );
+         dim_buffers.left_recv_view.bind( &call_with_offsets( dim_buffers.left_recv_offsets, array_view ) );
+         dim_buffers.right_send_view.bind( &call_with_offsets( dim_buffers.right_send_offsets, array_view ) );
+         dim_buffers.right_recv_view.bind( &call_with_offsets( dim_buffers.right_recv_offsets, array_view ) );
+      }
+
+   }
 
    template< std::size_t dim >
-   struct SendHelper
+   static void sendHelper( Buffers& buffers, RequestsVector& requests, MPI_Comm group, int tag_offset, SyncDirection mask )
    {
-      template< typename Requests, typename Group >
-      static void exec( Buffers& buffers, Requests& requests, Group group, int tag_offset, SyncDirection mask )
-      {
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 )
-            return;
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 )
+         return;
 
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
-         if( LBM_HACK == false ) {
-            if( mask & SyncDirection::Left ) {
-               requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
-                                               dim_buffers.left_send_view.getStorageSize(),
-                                               dim_buffers.left_neighbor, tag_offset + 0, group ) );
-               requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
-                                               dim_buffers.right_recv_view.getStorageSize(),
-                                               dim_buffers.right_neighbor, tag_offset + 0, group ) );
-            }
-            if( mask & SyncDirection::Right ) {
-               requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
-                                               dim_buffers.right_send_view.getStorageSize(),
-                                               dim_buffers.right_neighbor, tag_offset + 1, group ) );
-               requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
-                                               dim_buffers.left_recv_view.getStorageSize(),
-                                               dim_buffers.left_neighbor, tag_offset + 1, group ) );
-            }
-         }
-         else {
-            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
-                                            dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+      if( LBM_HACK == false ) {
+         if( mask & SyncDirection::Left ) {
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
+                                            dim_buffers.left_send_view.getStorageSize(),
                                             dim_buffers.left_neighbor, tag_offset + 0, group ) );
-            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                            dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
-                                            dim_buffers.left_neighbor, tag_offset + 1, group ) );
-            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                            dim_buffers.right_send_view.getStorageSize() / 27 * 9,
-                                            dim_buffers.right_neighbor, tag_offset + 1, group ) );
-            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
-                                            dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
+                                            dim_buffers.right_recv_view.getStorageSize(),
                                             dim_buffers.right_neighbor, tag_offset + 0, group ) );
          }
+         if( mask & SyncDirection::Right ) {
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
+                                            dim_buffers.right_send_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, tag_offset + 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
+                                            dim_buffers.left_recv_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, tag_offset + 1, group ) );
+         }
       }
-   };
+      else {
+         requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
+                                         dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.left_neighbor, tag_offset + 0, group ) );
+         requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                         dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.left_neighbor, tag_offset + 1, group ) );
+         requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                         dim_buffers.right_send_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.right_neighbor, tag_offset + 1, group ) );
+         requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
+                                         dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.right_neighbor, tag_offset + 0, group ) );
+      }
+   }
 
 #ifdef __NVCC__
 public:
diff --git a/src/TNL/Containers/StaticArray.h b/src/TNL/Containers/StaticArray.h
index 5702d9fe7375df94177f67d1e65cdaf3e65ffa4b..4f7f753c223fb24c75f0c08a011848d98927c451 100644
--- a/src/TNL/Containers/StaticArray.h
+++ b/src/TNL/Containers/StaticArray.h
@@ -52,7 +52,7 @@ public:
    /**
     * \brief Constructor from static array.
     *
-    * \param v[Size] input array.
+    * \param v Input array.
     */
    // Note: the template avoids ambiguity of overloaded functions with literal 0 and pointer
    // reference: https://stackoverflow.com/q/4610503
@@ -79,9 +79,9 @@ public:
    /**
     * \brief Constructor which initializes the array by copying elements from
     * \ref std::initializer_list, e.g. `{...}`.
-    * 
+    *
-    * The initializer list size must larger or equal to \e Size.
+    * The initializer list size must be larger than or equal to \e Size.
-    * 
+    *
     * @param elems input initializer list
     */
    __cuda_callable__
@@ -179,9 +179,9 @@ public:
 
    /**
     * \brief Assigns an object \e v of type \e T.
-    * 
+    *
     * T can be:
-    * 
+    *
     * 1. Static linear container implementing operator[] and having the same size.
     * In this case, \e v is copied to this array elementwise.
     * 2. An object that can be converted to \e Value type. In this case all elements
@@ -211,13 +211,13 @@ public:
 
    /**
     * \brief Cast operator for changing of the \e Value type.
-    * 
+    *
     * Returns static array having \e ValueType set to \e OtherValue, i.e.
     * StaticArray< Size, OtherValue >.
-    * 
-    * \tparam OtherValue is the \e Value type of the static array the casting 
+    *
+    * \tparam OtherValue is the \e Value type of the static array the casting
     * will be performed to.
-    * 
+    *
     * \return instance of StaticArray< Size, OtherValue >
     */
    template< typename OtherValue >
@@ -265,7 +265,7 @@ std::ostream& operator<<( std::ostream& str, const StaticArray< Size, Value >& a
 
 /**
  * \brief Serialization of static arrays into binary files.
- * 
+ *
  * \param file output file
  * \param array is an array to be written into the output file.
  */
@@ -274,7 +274,7 @@ File& operator<<( File& file, const StaticArray< Size, Value >& array );
 
 /**
  * \brief Serialization of static arrays into binary files.
- * 
+ *
  * \param file output file
  * \param array is an array to be written into the output file.
  */
@@ -283,7 +283,7 @@ File& operator<<( File&& file, const StaticArray< Size, Value >& array );
 
 /**
  * \brief Deserialization of static arrays from binary files.
- * 
+ *
  * \param file input file
  * \param array is an array to be read from the input file.
  */
@@ -292,7 +292,7 @@ File& operator>>( File& file, StaticArray< Size, Value >& array );
 
 /**
  * \brief Deserialization of static arrays from binary files.
- * 
+ *
  * \param file input file
  * \param array is an array to be read from the input file.
  */
diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index c6af2e4edc7c0bed8347ac47403eb2a6644baad2..c6c18fb0b2a6bde7467e2606bc08439e15ff7b3e 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Math.h>
 #include <TNL/Containers/StaticArray.h>
 #include <TNL/Containers/detail/StaticArrayAssignment.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 namespace TNL {
 namespace Containers {
@@ -49,7 +49,7 @@ struct StaticArrayComparator< Size, LeftValue, RightValue, Size >
 ////
 // Static array sort does static loop unrolling of array sort.
 // It performs static variant of bubble sort as follows:
-// 
+//
 // for( int k = Size - 1; k > 0; k--)
 //   for( int i = 0; i < k; i++ )
 //      if( data[ i ] > data[ i+1 ] )
@@ -102,21 +102,33 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value v[ Size ] )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v );
+   // Element-wise copy from the raw input array v; the indices are
+   // supplied by unrolledFor< int, 0, Size >.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { this->operator[]( idx ) = v[ idx ]; }
+   );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value& v )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), v );
+   // Fill constructor: every index supplied by the unrolled loop helper
+   // receives a copy of v.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { this->operator[]( idx ) = v; }
+   );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const StaticArray< Size, Value >& v )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v.getData() );
+   // Copy constructor: element idx of v is assigned to element idx of
+   // this array for each index supplied by unrolledFor.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { this->operator[]( idx ) = v[ idx ]; }
+   );
 }
 
 template< int Size, typename Value >
@@ -228,7 +240,11 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >& StaticArray< Size, Value >::operator=( const StaticArray< Size, Value >& array )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), array.getData() );
+   // Element-wise assignment from the array on the right-hand side; the
+   // indices are supplied by unrolledFor< int, 0, Size >.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { this->operator[]( idx ) = array[ idx ]; }
+   );
    return *this;
 }
 
@@ -264,7 +280,11 @@ StaticArray< Size, Value >::
 operator StaticArray< Size, OtherValue >() const
 {
    StaticArray< Size, OtherValue > aux;
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), getData() );
+   // Each element of this array is assigned to the matching slot of aux;
+   // the Value -> OtherValue conversion happens in that assignment.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { aux[ idx ] = this->operator[]( idx ); }
+   );
    return aux;
 }
 
@@ -272,7 +292,11 @@ template< int Size, typename Value >
 __cuda_callable__
 void StaticArray< Size, Value >::setValue( const ValueType& val )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), val );
+   // Assigns val to every element whose index is supplied by
+   // unrolledFor< int, 0, Size >.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { this->operator[]( idx ) = val; }
+   );
 }
 
 template< int Size, typename Value >
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index bbcf8a09f5f88edc202438cb9e323bd227e6f813..bb22eba8c03a4d352638fd0005524586bd947f2f 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -99,7 +99,11 @@ StaticVector< Size, Real >::
 operator StaticVector< Size, OtherReal >() const
 {
    StaticVector< Size, OtherReal > aux;
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), this->getData() );
+   // Each element of this vector is assigned to the matching slot of aux;
+   // the Real -> OtherReal conversion happens in that assignment.
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int idx ) mutable { aux[ idx ] = this->operator[]( idx ); }
+   );
    return aux;
 }
 
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 9d09ecd5045e767b89dde23c23537dd543d711ce..859e326d483441019edaa1649e69649a31bacbfe 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -22,8 +22,8 @@ namespace Containers {
  * The template parameters have the same meaning as in \ref Array, with \e Real
  * corresponding to \e Array's \e Value parameter.
  *
- * \tparam Real   An arithmetic type for the vector values, e.g. \ref float or
- *                \ref double.
+ * \tparam Real   An arithmetic type for the vector values, e.g. `float` or
+ *                `double`.
  * \tparam Device The device to be used for the execution of vector operations.
  * \tparam Index  The indexing type.
  * \tparam Allocator The type of the allocator used for the allocation and
@@ -51,7 +51,7 @@ public:
 
    /**
     * \brief Device where the vector is allocated.
-    * 
+    *
     * See \ref Devices::Host or \ref Devices::Cuda.
     */
    using DeviceType = Device;
@@ -63,7 +63,7 @@ public:
 
    /**
     * \brief Allocator type used for allocating this vector.
-    * 
+    *
     * See \ref Allocators::Cuda, \ref Allocators::CudaHost, \ref Allocators::CudaManaged, \ref Allocators::Host or \ref Allocators:Default.
     */
    using AllocatorType = Allocator;
@@ -114,7 +114,7 @@ public:
 
    /**
     * \brief Constructor from expression template
-    * 
+    *
     * @param expression input expression template
     */
    template< typename VectorExpression,
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 83ec6d0b0e0c828981a2bfcdf9e1f84cb473e219..2416b85095597398a5d25228e8ac6ee50bf37fb2 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -25,8 +25,8 @@ namespace Containers {
  * The template parameters have the same meaning as in \ref ArrayView, with
  * \e Real corresponding to \e ArrayView's \e Value parameter.
  *
- * \tparam Real   An arithmetic type for the vector values, e.g. \ref float or
- *                \ref double.
+ * \tparam Real   An arithmetic type for the vector values, e.g. `float` or
+ *                `double`.
  * \tparam Device The device to be used for the execution of vector operations.
  * \tparam Index  The indexing type.
  */
diff --git a/src/TNL/Containers/detail/StaticArrayAssignment.h b/src/TNL/Containers/detail/StaticArrayAssignment.h
index 6ba6c8e02a1ab81388203cef969eca594bef29c6..0e3f0e36667195d337a12a973820e348b4c595b9 100644
--- a/src/TNL/Containers/detail/StaticArrayAssignment.h
+++ b/src/TNL/Containers/detail/StaticArrayAssignment.h
@@ -11,32 +11,12 @@
 #pragma once
 
 #include <TNL/TypeTraits.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 namespace TNL {
 namespace Containers {
 namespace detail {
 
-struct AssignArrayFunctor
-{
-   template< typename LeftValue, typename RightValue >
-   __cuda_callable__
-   void operator()( int i, LeftValue* data, const RightValue* v ) const
-   {
-      data[ i ] = v[ i ];
-   }
-};
-
-struct AssignValueFunctor
-{
-   template< typename LeftValue, typename RightValue >
-   __cuda_callable__
-   void operator()( int i, LeftValue* data, const RightValue& v ) const
-   {
-      data[ i ] = v;
-   }
-};
-
 template< typename StaticArray,
           typename T,
           bool isStaticArrayType = IsStaticArrayType< T >::value >
@@ -49,11 +29,15 @@ template< typename StaticArray,
           typename T >
 struct StaticArrayAssignment< StaticArray, T, true >
 {
-   __cuda_callable__
-   static void assign( StaticArray& a, const T& v )
+   static constexpr void assign( StaticArray& a, const T& v )
    {
-      static_assert( StaticArray::getSize() == T::getSize(), "Cannot assign static arrays with different size." );
-      Algorithms::StaticFor< 0, StaticArray::getSize() >::exec( AssignArrayFunctor{}, a.getData(), v.getData() );
+      // The sizes are checked at compile time; after that the elements of v
+      // are assigned to the elements of a one index at a time.
+      static_assert( StaticArray::getSize() == T::getSize(),
+                     "Cannot assign static arrays with different size." );
+      Algorithms::unrolledFor< int, 0, StaticArray::getSize() >(
+         [&] ( int idx ) mutable { a[ idx ] = v[ idx ]; }
+      );
    }
 };
 
@@ -65,10 +49,13 @@ template< typename StaticArray,
           typename T >
 struct StaticArrayAssignment< StaticArray, T, false >
 {
-   __cuda_callable__
-   static void assign( StaticArray& a, const T& v )
+   static constexpr void assign( StaticArray& a, const T& v )
    {
-      Algorithms::StaticFor< 0, StaticArray::getSize() >::exec( AssignValueFunctor{}, a.getData(), v );
+      // v is not a static array here: the same value is assigned to every
+      // element of a.
+      Algorithms::unrolledFor< int, 0, StaticArray::getSize() >(
+         [&] ( int idx ) mutable { a[ idx ] = v; }
+      );
    }
 };
 
diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h
index 1375683b28d9a04ab8d0888f6e79274d79f62fe0..97fd8122c13cc09686013ed06c2f65670b054b26 100644
--- a/src/TNL/Containers/ndarray/SizesHolder.h
+++ b/src/TNL/Containers/ndarray/SizesHolder.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Cuda/CudaCallable.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 #include <TNL/Containers/ndarray/Meta.h>
 
@@ -124,48 +124,6 @@ protected:
     }
 };
 
-template< std::size_t dimension >
-struct SizesHolderStaticSizePrinter
-{
-   template< typename SizesHolder >
-   static void exec( std::ostream& str, const SizesHolder& holder )
-   {
-      str << holder.template getStaticSize< dimension >() << ", ";
-   }
-};
-
-template< std::size_t dimension >
-struct SizesHolderSizePrinter
-{
-   template< typename SizesHolder >
-   static void exec( std::ostream& str, const SizesHolder& holder )
-   {
-      str << holder.template getSize< dimension >() << ", ";
-   }
-};
-
-template< std::size_t level >
-struct SizesHolerOperatorPlusHelper
-{
-   template< typename Result, typename LHS, typename RHS >
-   static void exec( Result& result, const LHS& lhs, const RHS& rhs )
-   {
-      if( result.template getStaticSize< level >() == 0 )
-         result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() );
-   }
-};
-
-template< std::size_t level >
-struct SizesHolerOperatorMinusHelper
-{
-   template< typename Result, typename LHS, typename RHS >
-   static void exec( Result& result, const LHS& lhs, const RHS& rhs )
-   {
-      if( result.template getStaticSize< level >() == 0 )
-         result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() );
-   }
-};
-
 } // namespace __ndarray_impl
 
 
@@ -231,7 +189,12 @@ SizesHolder< Index, sizes... >
 operator+( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs )
 {
    SizesHolder< Index, sizes... > result;
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorPlusHelper >::execHost( result, lhs, rhs );
+   // Levels whose static size is non-zero are left untouched; every other
+   // level is set to the sum of the sizes of both operands.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) >( [&] ( auto level ) {
+      if( result.template getStaticSize< level >() == 0 )
+         result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() );
+   } );
    return result;
 }
 
@@ -242,7 +205,12 @@ SizesHolder< Index, sizes... >
 operator-( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs )
 {
    SizesHolder< Index, sizes... > result;
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorMinusHelper >::execHost( result, lhs, rhs );
+   // Levels whose static size is non-zero are left untouched; every other
+   // level is set to the difference of the sizes of both operands.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) >( [&] ( auto level ) {
+      if( result.template getStaticSize< level >() == 0 )
+         result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() );
+   } );
    return result;
 }
 
@@ -295,9 +263,17 @@ template< typename Index,
 std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder )
 {
    str << "SizesHolder< ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, holder );
+   // All static sizes except the last one, each followed by ", "; the
+   // last one is written by the statement that follows.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(
+      [&] ( auto dim ) { str << holder.template getStaticSize< dim >() << ", "; }
+   );
    str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder );
+   // All sizes except the last one, each followed by ", "; the last one
+   // is written by the statement that follows.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(
+      [&] ( auto dim ) { str << holder.template getSize< dim >() << ", "; }
+   );
    str << holder.template getSize< sizeof...(sizes) - 1 >() << " )";
    return str;
 }
@@ -360,10 +336,18 @@ template< typename Index,
 std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... >, ConstValue >& holder )
 {
    str << "LocalBeginsHolder< SizesHolder< ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... >) holder );
+   // All static sizes except the last one, each followed by ", "; the
+   // last one is written by the statement that follows.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(
+      [&] ( auto dim ) { str << holder.template getStaticSize< dim >() << ", "; }
+   );
    str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >, ";
    str << ConstValue << " >( ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder );
+   // All sizes except the last one, each followed by ", "; the last one
+   // is written by the statement that follows.
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(
+      [&] ( auto dim ) { str << holder.template getSize< dim >() << ", "; }
+   );
    str << holder.template getSize< sizeof...(sizes) - 1 >() << " )";
    return str;
 }
diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
index 4e5473c701389a25c1386778a8eab95dc5d77fe0..fc835d13f4c102b3b14b417394cfd42265bf7977 100644
--- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h
+++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
@@ -15,7 +15,7 @@
 #include <algorithm>
 
 #include <TNL/Assert.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Containers/ndarray/Meta.h>
 
 namespace TNL {
@@ -209,18 +209,6 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 >
 };
 
 
-template< std::size_t level >
-struct WeakCompareHelper
-{
-   template< typename SizesHolder1,
-             typename SizesHolder2 >
-   __cuda_callable__
-   static void exec( const SizesHolder1& sizes1, const SizesHolder2& sizes2, bool& result )
-   {
-      result &= sizes1.template getSize< level >() == sizes2.template getSize< level >();
-   }
-};
-
 // helper for the assignment operator in NDArrayView
 template< typename SizesHolder1,
           typename SizesHolder2 >
@@ -230,7 +218,11 @@ bool sizesWeakCompare( const SizesHolder1& sizes1, const SizesHolder2& sizes2 )
    static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(),
                   "Cannot compare sizes of different dimensions." );
    bool result = true;
-   Algorithms::TemplateStaticFor< std::size_t, 0, SizesHolder1::getDimension(), WeakCompareHelper >::exec( sizes1, sizes2, result );
+   // result stays true only while every compared level has equal sizes.
+   Algorithms::staticFor< std::size_t, 0, SizesHolder1::getDimension() >( [&] ( auto level ) {
+      if( result && sizes1.template getSize< level >() != sizes2.template getSize< level >() )
+         result = false;
+   } );
    return result;
 }
 
diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h
index b9ec101cf60d2bb3d266cb92d07bfbccac1eb28f..66c585252e8fbbce05f856dbbabf8549a84a33b2 100644
--- a/src/TNL/Functions/CutMeshFunction.h
+++ b/src/TNL/Functions/CutMeshFunction.h
@@ -10,7 +10,6 @@
 
 #pragma once
 
-#include <TNL/Algorithms/StaticVectorFor.h>
 #include <TNL/Containers/StaticVector.h>
 
 namespace TNL {
@@ -22,6 +21,38 @@ template <  typename MeshFunctionType,
             int codimension=MeshFunctionType::getMeshDimension()-OutMesh::getMeshDimension()>
 class CutMeshFunction
 {
+   /**
+    * Sequentially calls f( index, args... ) for every multi-index in the box
+    * [begin, end); index[ 0 ] runs fastest. Only dim 1, 2 and 3 are supported.
+    */
+   template< typename Index, typename Function, typename... FunctionArgs, int dim >
+   static void
+   staticVectorFor( const Containers::StaticVector< dim, Index >& begin,
+                    const Containers::StaticVector< dim, Index >& end,
+                    Function f,
+                    FunctionArgs... args )
+   {
+      static_assert( 1 <= dim && dim <= 3, "unsupported dimension" );
+      Containers::StaticVector< dim, Index > index;
+      switch( dim ) {
+         case 1:
+            for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+               f( index, args... );
+            break;
+         case 2:
+            for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+               for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+                  f( index, args... );
+            break;
+         case 3:
+            for( index[2] = begin[2]; index[2] < end[2]; index[2]++ )
+               for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+                  for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+                     f( index, args... );
+            break;
+      }
+   }
+
   public:
     static bool Cut(MeshFunctionType &inputMeshFunction,
                     OutMesh &outMesh,
@@ -99,7 +130,7 @@ class CutMeshFunction
 
             typename OutMesh::CoordinatesType starts;
             starts.setValue(0);
-            Algorithms::StaticVectorFor::exec(starts,outMesh.getDimensions(),kernel);
+            staticVectorFor(starts,outMesh.getDimensions(),kernel);
         }
 
         return inCut;
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
index 840a201c6021448e4f0de99552c4918364f92874..ba619bc927e98c48ddb86c469fbde302646f745f 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +36,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -46,12 +46,12 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -65,7 +65,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -79,10 +79,10 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
@@ -107,7 +107,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -117,14 +117,14 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       static constexpr int stencilSize = Config::getStencilSize();
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -138,7 +138,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntityType( this->entity.getMesh(), CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -157,33 +157,25 @@ class NeighborGridEntityGetter<
 #else
          return this->entity.getIndex() + step;
 #endif
- 
+
       }
- 
-      template< IndexType index >
-      class StencilRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencil[ index + stencilSize ] = entityIndex + index;
-            }
-      };
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA  // TODO: fix it -- does not work with nvcc
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilRefresher >::exec( *this, entityIndex );
+         // Slot ( offset + stencilSize ) of the stencil stores
+         // entityIndex + offset for each offset supplied by staticFor.
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >(
+            [&] ( auto offset ) { stencil[ offset + stencilSize ] = entityIndex + offset; }
+         );
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencil[ 2 * stencilSize + 1 ];
 };
 
@@ -204,7 +196,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -214,12 +206,12 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -233,7 +225,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step + ( step < 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -247,16 +239,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step + ( step < 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 /****
@@ -277,7 +269,7 @@ class NeighborGridEntityGetter<
    StencilStorage > //GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -287,14 +279,14 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       void test() const { std::cerr << "***" << std::endl; };
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -308,7 +300,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step - ( step > 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -322,7 +314,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step - ( step > 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -348,7 +340,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -358,13 +350,13 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
 
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -378,7 +370,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step - ( step > 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -392,7 +384,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step - ( step > 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -419,7 +411,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -434,7 +426,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -448,7 +440,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -463,11 +455,11 @@ class NeighborGridEntityGetter<
 
          return this->entity.getIndex() + step;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
- 
+
    protected:
 
       const GridEntityType& entity;
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
index 1591a523e0e9785d535ec535e1b0d6449d3c88ea..00286663afd5be1f31a46f7303e82b739b24373c 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
@@ -14,6 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +37,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -51,7 +52,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -67,7 +68,7 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX,
                                                           entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -81,14 +82,14 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepY * entity.getMesh().getDimensions().x() + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -109,7 +110,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -120,14 +121,14 @@ class NeighborGridEntityGetter<
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
       typedef GridEntityStencilStorageTag< GridEntityCrossStencil > StencilStorage;
- 
+
       static constexpr int stencilSize = Config::getStencilSize();
 
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -143,7 +144,7 @@ class NeighborGridEntityGetter<
                                             CoordinatesType( entity.getCoordinates().x() + stepX,
                                                              entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -166,52 +167,33 @@ class NeighborGridEntityGetter<
 #else
          return this->entity.getIndex() + stepY * entity.getMesh().getDimensions().x() + stepX;
 #endif
- 
-      }
- 
-      template< IndexType index >
-      class StencilXRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilX[ index + stencilSize ] = entityIndex + index;
-            }
-      };
 
-      template< IndexType index >
-      class StencilYRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilY[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().getDimensions().x();
-            }
-      };
+      }
 
- 
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA // TODO: fix this to work with CUDA
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilXRefresher >::exec( *this, entityIndex );
+         auto stencilXRefresher = [&] ( auto index ) {  // x direction: neighbour index is entityIndex shifted by the offset itself
+            stencilX[ index + stencilSize ] = entityIndex + index;  // slots are shifted by stencilSize so negative offsets map to valid array positions
+         };
+         auto stencilYRefresher = [&] ( auto index ) {  // y direction: one step corresponds to getDimensions().x() entities
+            stencilY[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().getDimensions().x();
+         };
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilYRefresher );  // y offsets -stencilSize..-1 (assuming a half-open [begin, end) range -- TODO confirm)
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilYRefresher );  // y offsets 1..stencilSize; under the same assumption offset 0 is never written to stencilY here
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >( stencilXRefresher );  // x offsets -stencilSize..stencilSize, including 0
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencilX[ 2 * stencilSize + 1 ];
       IndexType stencilY[ 2 * stencilSize + 1 ];
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -233,7 +215,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -250,7 +232,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -276,14 +258,14 @@ class NeighborGridEntityGetter<
                                                                 stepY ? (stepY > 0 ? 1 : -1) : 0 ),
                                          EntityBasisType( ! stepX, ! stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -310,7 +292,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -325,7 +307,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -348,21 +330,21 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX + ( stepX < 0 ),
                                                           entity.getCoordinates().y() + stepY + ( stepY < 0 ) ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -384,7 +366,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -399,7 +381,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -424,17 +406,17 @@ class NeighborGridEntityGetter<
                      CoordinatesType( entity.getCoordinates().x() + stepX - ( stepX > 0 ) * ( entity.getOrientation().x() != 0.0 ),
                                       entity.getCoordinates().y() + stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ) ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
@@ -458,7 +440,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -473,7 +455,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -489,7 +471,7 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX,
                                                           entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -503,14 +485,14 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepY * ( entity.getMesh().getDimensions().x() + 1 ) + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
index 7c260f5779f54c3903cc97cad2ccbc5dab157477..1a377490baab7638f882d13b30af2adcda2ba1c2 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +36,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -51,7 +51,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -68,7 +68,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -83,16 +83,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + ( stepZ * entity.getMesh().getDimensions().y() + stepY ) * entity.getMesh().getDimensions().x() + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 
@@ -113,7 +113,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -126,12 +126,12 @@ class NeighborGridEntityGetter<
       typedef GridEntityStencilStorageTag< GridEntityCrossStencil > StencilStorage;
 
       static constexpr int stencilSize = Config::getStencilSize();
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -148,7 +148,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -179,66 +179,38 @@ class NeighborGridEntityGetter<
 #endif
 
       }
- 
-      template< IndexType index >
-      class StencilXRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilX[ index + stencilSize ] = entityIndex + index;
-            }
-      };
-
-      template< IndexType index >
-      class StencilYRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilY[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().getDimensions().x();
-            }
-      };
- 
-      template< IndexType index >
-      class StencilZRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilZ[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().cellZNeighborsStep;
-            }
-      };
 
- 
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA // TODO: fix this to work with CUDA
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilZRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilZRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilXRefresher >::exec( *this, entityIndex );
+         auto stencilXRefresher = [&] ( auto index ) {  // x direction: neighbour index is entityIndex shifted by the offset itself
+            stencilX[ index + stencilSize ] = entityIndex + index;  // slots are shifted by stencilSize so negative offsets map to valid array positions
+         };
+         auto stencilYRefresher = [&] ( auto index ) {  // y direction: one step corresponds to getDimensions().x() entities
+            stencilY[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().getDimensions().x();
+         };
+         auto stencilZRefresher = [&] ( auto index ) {  // z direction: one step corresponds to the mesh's cellZNeighborsStep
+            stencilZ[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().cellZNeighborsStep;
+         };
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilZRefresher );  // z offsets -stencilSize..-1 (assuming a half-open [begin, end) range -- TODO confirm)
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilZRefresher );  // z offsets 1..stencilSize; offset 0 is not written to stencilZ here
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilYRefresher );  // y offsets -stencilSize..-1
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilYRefresher );  // y offsets 1..stencilSize; offset 0 is not written to stencilY here
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >( stencilXRefresher );  // x offsets -stencilSize..stencilSize, including 0
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencilX[ 2 * stencilSize + 1 ];
       IndexType stencilY[ 2 * stencilSize + 1 ];
       IndexType stencilZ[ 2 * stencilSize + 1 ];
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -259,7 +231,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -276,7 +248,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -307,21 +279,21 @@ class NeighborGridEntityGetter<
                                                                stepZ ? (stepZ > 0 ? 1 : -1) : 0 ),
                                         EntityBasisType( ! stepX, !stepY, !stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -342,7 +314,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -359,7 +331,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -390,21 +362,21 @@ class NeighborGridEntityGetter<
                                                                stepZ ? (stepZ > 0 ? 1 : -1) : 0 ),
                                         EntityBasisType( ! stepX, !stepY, !stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -426,7 +398,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -443,7 +415,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -472,21 +444,21 @@ class NeighborGridEntityGetter<
                                         EntityOrientationType( !!stepX, !!stepY, !!stepZ ),
                                         EntityBasisType( !stepX, !stepY, !stepZ ));
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -508,7 +480,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -523,7 +495,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY,int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -554,21 +526,21 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY + ( stepY < 0 ),
                                                          entity.getCoordinates().z() + stepZ + ( stepZ < 0 ) ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -589,7 +561,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -604,7 +576,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -637,21 +609,21 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ),
                                                          entity.getCoordinates().z() + stepZ - ( stepZ > 0 ) * ( entity.getOrientation().z() != 0.0 ) ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -672,7 +644,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -687,7 +659,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -705,7 +677,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -720,16 +692,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepZ * ( entity.getMesh().getDimensions().y() + 1 + stepY ) * ( entity.getMesh().getDimensions().x() + 1 ) + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 } // namespace Meshes
diff --git a/src/TNL/Meshes/Mesh.h b/src/TNL/Meshes/Mesh.h
index fa5cfeb58e028276c3e730b6684bae5bba50e1d4..41e24412b65b0e8a004be28b3505ddfe62587c67 100644
--- a/src/TNL/Meshes/Mesh.h
+++ b/src/TNL/Meshes/Mesh.h
@@ -202,7 +202,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forAll( Func f ) const;
@@ -213,7 +213,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forBoundary( Func f ) const;
@@ -224,7 +224,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forInterior( Func f ) const;
@@ -235,7 +235,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forLocal( Func f ) const;
@@ -246,7 +246,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forGhost( Func f ) const;
diff --git a/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h b/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
index 71e9fab0eb2a691372af6adc0b5d93c3ef415889..9195a2e2a89c32daaa53674a80bb8edd6f393fac 100644
--- a/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
+++ b/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
@@ -28,7 +28,7 @@ private:
                 Mesh::MeshTraitsType::template SubentityTraits< typename Mesh::template EntityType< Dimension >::EntityTopology,
                                                                 Subdimension >::storageEnabled
              >
-   struct _SubentitiesStorageWorker
+   struct SubentitiesStorageWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& perm )
       {
@@ -38,7 +38,7 @@ private:
    };
 
    template< int Subdimension >
-   struct _SubentitiesStorageWorker< Subdimension, false >
+   struct SubentitiesStorageWorker< Subdimension, false >  // specialization for a false second argument: exec is a no-op
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -49,7 +49,7 @@ private:
                 Mesh::MeshTraitsType::template SuperentityTraits< typename Mesh::template EntityType< Dimension >::EntityTopology,
                                                                   Superdimension >::storageEnabled
              >
-   struct _SuperentitiesStorageWorker
+   struct SuperentitiesStorageWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& perm )
       {
@@ -60,7 +60,7 @@ private:
    };
 
    template< int Superdimension >
-   struct _SuperentitiesStorageWorker< Superdimension, false >
+   struct SuperentitiesStorageWorker< Superdimension, false >  // specialization for a false second argument: exec is a no-op
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -71,7 +71,7 @@ private:
                 Mesh::MeshTraitsType::template SuperentityTraits< typename Mesh::template EntityType< Subdimension >::EntityTopology,
                                                                   Dimension >::storageEnabled
              >
-   struct IndexPermutationApplierSubentitiesWorker
+   struct SubentitiesWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm )
       {
@@ -81,7 +81,7 @@ private:
    };
 
    template< int Subdimension >
-   struct IndexPermutationApplierSubentitiesWorker< Subdimension, false >
+   struct SubentitiesWorker< Subdimension, false >  // specialization for a false second argument: exec is a no-op
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -92,7 +92,7 @@ private:
                 Mesh::MeshTraitsType::template SubentityTraits< typename Mesh::template EntityType< Superdimension >::EntityTopology,
                                                                 Dimension >::storageEnabled
              >
-   struct IndexPermutationApplierSuperentitiesWorker
+   struct SuperentitiesWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm )
       {
@@ -102,25 +102,12 @@ private:
    };
 
    template< int Superdimension >
-   struct IndexPermutationApplierSuperentitiesWorker< Superdimension, false >
+   struct SuperentitiesWorker< Superdimension, false >  // specialization for a false second argument: exec is a no-op
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
 
 
-   // template aliases needed to hide the 'Enabled' parameter
-   template< int Subdimension >
-   using SubentitiesStorageWorker = _SubentitiesStorageWorker< Subdimension >;
-
-   template< int Superdimension >
-   using SuperentitiesStorageWorker = _SuperentitiesStorageWorker< Superdimension >;
-
-   template< int Subdimension >
-   using SubentitiesWorker = IndexPermutationApplierSubentitiesWorker< Subdimension >;
-
-   template< int Superdimension >
-   using SuperentitiesWorker = IndexPermutationApplierSuperentitiesWorker< Superdimension >;
-
    template< typename Mesh_, std::enable_if_t< Mesh_::Config::dualGraphStorage(), bool > = true >
    static void permuteDualGraph( Mesh_& mesh, const GlobalIndexArray& perm, const GlobalIndexArray& iperm )
    {
@@ -183,17 +170,33 @@ public:
       if( Dimension == 0 )
          permuteArray( mesh.getPoints(), perm );
 
-      // permute superentities storage
-      Algorithms::TemplateStaticFor< int, 0, Dimension, SubentitiesStorageWorker >::execHost( mesh, perm );
-
       // permute subentities storage
-      Algorithms::TemplateStaticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1, SuperentitiesStorageWorker >::execHost( mesh, perm );
+      Algorithms::staticFor< int, 0, Dimension >(
+         [&] ( auto dim ) {
+            SubentitiesStorageWorker< dim >::exec( mesh, perm );
+         }
+      );
+
+      // permute superentities storage
+      Algorithms::staticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1 >(
+         [&] ( auto dim ) {
+            SuperentitiesStorageWorker< dim >::exec( mesh, perm );
+         }
+      );
 
       // update superentity indices from the subentities
-      Algorithms::TemplateStaticFor< int, 0, Dimension, SubentitiesWorker >::execHost( mesh, iperm );
+      Algorithms::staticFor< int, 0, Dimension >(
+         [&] ( auto dim ) {
+            SubentitiesWorker< dim >::exec( mesh, iperm );
+         }
+      );
 
       // update subentity indices from the superentities
-      Algorithms::TemplateStaticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1, SuperentitiesWorker >::execHost( mesh, iperm );
+      Algorithms::staticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1 >(
+         [&] ( auto dim ) {
+            SuperentitiesWorker< dim >::exec( mesh, iperm );
+         }
+      );
 
       if( Dimension == Mesh::getMeshDimension() ) {
          // permute dual graph
diff --git a/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h b/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
index 1e2edb332325921132a5050d166190d394db24d5..52dcc73c71aeb3eeff3789bf66173018af81d9ac 100644
--- a/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
+++ b/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Meshes/MeshDetails/traits/MeshTraits.h>
 
 namespace TNL {
@@ -43,35 +43,20 @@ public:
    static SubentitySeedArray create( const SubvertexAccessorType& subvertices )
    {
       SubentitySeedArray subentitySeeds;
-      Algorithms::TemplateStaticFor< LocalIndexType, 0, SubentitySeedArray::getSize(), CreateSubentitySeeds >::execHost( subentitySeeds, subvertices );
+      Algorithms::staticFor< LocalIndexType, 0, SubentitySeedArray::getSize() >(
+         [&] ( auto subentityIndex ) {
+            Algorithms::staticFor< LocalIndexType, 0, SUBENTITY_VERTICES_COUNT >(
+               [&] ( auto subentityVertexIndex ) {
+                  // the captured subentityIndex is not usable as a constant expression in this nested lambda, so a new instance of its type, decltype(subentityIndex){}, is passed as the template argument instead
+                  static constexpr LocalIndexType VERTEX_INDEX = SubentityTraits::template Vertex< decltype(subentityIndex){}, subentityVertexIndex >::index;
+                  subentitySeeds[ subentityIndex ].setCornerId( subentityVertexIndex, subvertices.getColumnIndex( VERTEX_INDEX ) );
+               }
+            );
+         }
+      );
 
       return subentitySeeds;
    }
-
-private:
-   using SubentitySeed = EntitySeed< MeshConfig, SubentityTopology >;
-
-   template< LocalIndexType subentityIndex >
-   class CreateSubentitySeeds
-   {
-      public:
-         static void exec( SubentitySeedArray& subentitySeeds, const SubvertexAccessorType& subvertices )
-         {
-            Algorithms::TemplateStaticFor< LocalIndexType, 0, SUBENTITY_VERTICES_COUNT, SetSubentitySeedVertex >::execHost( subentitySeeds[ subentityIndex ], subvertices );
-         }
-
-      private:
-         template< LocalIndexType subentityVertexIndex >
-         class SetSubentitySeedVertex
-         {
-            public:
-               static void exec( SubentitySeed& subentitySeed, const SubvertexAccessorType& subvertices )
-               {
-                  static constexpr LocalIndexType VERTEX_INDEX = SubentityTraits::template Vertex< subentityIndex, subentityVertexIndex >::index;
-                  subentitySeed.setCornerId( subentityVertexIndex, subvertices.getColumnIndex( VERTEX_INDEX ) );
-               }
-         };
-   };
 };
 
 template< typename MeshConfig,
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
index 95e29182ea3fd3ebfb485da142784225886e4f10..55d55890ff8d08c64d2626427e884666826ee096 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Meshes/DimensionTag.h>
@@ -47,15 +47,6 @@ protected:
       static constexpr bool value = MeshConfig::entityTagsStorage( EntityTopology() );
    };
 
-   template< int Dimension >
-   struct SetEntitiesCount
-   {
-      static void exec( Mesh& mesh )
-      {
-         mesh.template entityTagsSetEntitiesCount< Dimension >( mesh.template getEntitiesCount< Dimension >() );
-      }
-   };
-
    template< int Dimension >
    class ResetEntityTags
    {
@@ -123,15 +114,6 @@ protected:
       }
    };
 
-   template< int Dimension >
-   struct UpdateEntityTagsLayer
-   {
-      static void exec( Mesh& mesh )
-      {
-         mesh.template updateEntityTagsLayer< Dimension >();
-      }
-   };
-
 // nvcc does not allow __cuda_callable__ lambdas inside private or protected sections
 #ifdef __NVCC__
 public:
@@ -144,8 +126,19 @@ public:
    public:
       static void exec( Mesh& mesh )
       {
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, SetEntitiesCount >::execHost( mesh );
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, ResetEntityTags >::execHost( mesh );
+         // set entities count
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               mesh.template entityTagsSetEntitiesCount< dim >( mesh.template getEntitiesCount< dim >() );
+            }
+         );
+
+         // reset entity tags
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               ResetEntityTags< dim >::exec( mesh );
+            }
+         );
 
          auto kernel = [] __cuda_callable__
             ( GlobalIndexType faceIndex,
@@ -159,7 +152,11 @@ public:
                const GlobalIndexType cellIndex = face.template getSuperentityIndex< Mesh::getMeshDimension() >( 0 );
                mesh->template addEntityTag< Mesh::getMeshDimension() >( cellIndex, EntityTags::BoundaryEntity );
                // initialize all subentities
-               Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() - 1, InitializeSubentities >::exec( *mesh, faceIndex, face );
+               Algorithms::staticFor< int, 0, Mesh::getMeshDimension() - 1 >(
+                  [&mesh, faceIndex, &face] ( auto dim ) {
+                     InitializeSubentities< dim >::exec( *mesh, faceIndex, face );
+                  }
+               );
             }
          };
 
@@ -169,7 +166,12 @@ public:
                                                       kernel,
                                                       &meshPointer.template modifyData< DeviceType >() );
 
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, UpdateEntityTagsLayer >::execHost( mesh );
+         // update entity tags
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               mesh.template updateEntityTagsLayer< dim >();
+            }
+         );
       }
    };
 
diff --git a/src/TNL/Meshes/MeshEntity.h b/src/TNL/Meshes/MeshEntity.h
index b077ed04544f2d49b8b042680d698d9b92b64c09..cc55db2af4192f3dc271e3923f9941d7f1e4f42f 100644
--- a/src/TNL/Meshes/MeshEntity.h
+++ b/src/TNL/Meshes/MeshEntity.h
@@ -62,11 +62,20 @@ class MeshEntity
       __cuda_callable__
       bool operator!=( const MeshEntity& entity ) const;
 
+      /**
+       * \brief Returns the dimension of this mesh entity.
+       */
       static constexpr int getEntityDimension();
 
+      /**
+       * \brief Returns a reference to the mesh that owns this mesh entity.
+       */
       __cuda_callable__
       const MeshType& getMesh() const;
 
+      /**
+       * \brief Returns the index of this mesh entity.
+       */
       __cuda_callable__
       GlobalIndexType getIndex() const;
 
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 30ea96b4da4a0a3fa41e3c031e4e14942833c781..1e4361f4931c3213cb5dd36c9bf23a5796f84df8 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -4,6 +4,8 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
+         staticForTest
+         unrolledForTest
 )
 
 set( CPP_TESTS )
diff --git a/src/UnitTests/Algorithms/ParallelForTest.h b/src/UnitTests/Algorithms/ParallelForTest.h
index aa75fd56093df72bb83b126fde7b3f77e363aa66..fe07247d2ac24b3ed497c0666032180545935d67 100644
--- a/src/UnitTests/Algorithms/ParallelForTest.h
+++ b/src/UnitTests/Algorithms/ParallelForTest.h
@@ -8,6 +8,8 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#pragma once
+
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Array.h>
@@ -164,7 +166,7 @@ void test_1D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
@@ -200,7 +202,7 @@ void test_2D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -213,7 +215,7 @@ void test_2D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
@@ -249,7 +251,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -262,7 +264,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -275,7 +277,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
diff --git a/src/UnitTests/Algorithms/staticForTest.cpp b/src/UnitTests/Algorithms/staticForTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c40d3d9b9eb2ebc3b9b4347b1450927f124dfa6e
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.cpp
@@ -0,0 +1 @@
+#include "staticForTest.h"
diff --git a/src/UnitTests/Algorithms/staticForTest.cu b/src/UnitTests/Algorithms/staticForTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c40d3d9b9eb2ebc3b9b4347b1450927f124dfa6e
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.cu
@@ -0,0 +1 @@
+#include "staticForTest.h"
diff --git a/src/UnitTests/Algorithms/staticForTest.h b/src/UnitTests/Algorithms/staticForTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a44f65e2c01d00e71544ff39f32ee3886f00703
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.h
@@ -0,0 +1,158 @@
+/***************************************************************************
+                          staticForTest.h  -  description
+                             -------------------
+    begin                : Apr 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <array>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/staticFor.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+
+#ifdef HAVE_GTEST
+// Runs staticFor over [0, 5) using the index as an ordinary array subscript.
+TEST( staticForTest, host_dynamic )
+{
+   constexpr int length = 5;
+   std::array< int, length > hits{};  // value-initialized: all zeros
+   std::array< int, length > expected;
+   expected.fill( 1 );
+
+   // every position must be incremented exactly once
+   staticFor< int, 0, length >(
+      [&hits] ( auto index ) {
+         hits[ index ] += 1;
+      }
+   );
+   EXPECT_EQ( hits, expected );
+}
+
+// Runs staticFor over [0, 5) using the index as a template argument of std::get.
+TEST( staticForTest, host_static )
+{
+   constexpr int length = 5;
+   std::array< int, length > hits{};  // value-initialized: all zeros
+   std::array< int, length > expected;
+   expected.fill( 1 );
+
+   // std::get requires a compile-time constant, which the index has to provide
+   staticFor< int, 0, length >(
+      [&hits] ( auto index ) {
+         std::get< index >( hits ) += 1;
+      }
+   );
+   EXPECT_EQ( hits, expected );
+}
+
+// Empty (begin == end) and reversed (begin > end) ranges
+// must never call the functor.
+TEST( staticForTest, host_empty )
+{
+   bool invoked = false;
+
+   // begin == end; the index is irrelevant, so the parameter is unnamed
+   staticFor< int, 0, 0 >(
+      [&invoked] ( auto ) { invoked = true; }
+   );
+   EXPECT_FALSE( invoked );
+
+   // begin > end
+   staticFor< int, 0, -1 >(
+      [&invoked] ( auto ) { invoked = true; }
+   );
+   EXPECT_FALSE( invoked );
+}
+
+#ifdef HAVE_CUDA
+// nvcc does not allow __cuda_callable__ lambdas inside private regions,
+// hence the test body is kept in a free function.
+// Runs staticFor over [0, 5) inside a kernel executed by ParallelFor over
+// [0, 1), using each index as a plain subscript into a device array view.
+void test_cuda_dynamic()
+{
+   using DeviceArray = Containers::Array< int, Devices::Cuda >;
+   using HostArray = Containers::Array< int, Devices::Host >;
+   constexpr int length = 5;
+
+   HostArray expected;
+   expected.setSize( length );
+   expected.setValue( 1 );
+
+   DeviceArray data( length );
+   data.setValue( 0 );
+   auto dataView = data.getView();
+
+   // the view is captured by copy ([=]); mutable makes the copy non-const
+   auto kernel = [=] __cuda_callable__ ( int ) mutable
+   {
+      staticFor< int, 0, length >(
+         [&dataView] ( auto index ) { dataView[ index ] += 1; }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+
+   HostArray result;
+   result = data;  // assigning a Cuda array to a Host array brings the result to the host
+   EXPECT_EQ( result, expected );
+}
+
+TEST( staticForTest, cuda_dynamic ) { test_cuda_dynamic(); }
+
+// Adds 1 to the element of view at position i; the position is a template
+// parameter, so it has to be known at compile time.
+template< int i, typename View >
+__cuda_callable__
+void static_helper( View& view ) { view[ i ] += 1; }
+
+// nvcc does not allow __cuda_callable__ lambdas inside private regions,
+// hence the test body is kept in a free function.
+// Runs staticFor over [0, 5) inside a kernel executed by ParallelFor over
+// [0, 1), passing each index to static_helper as a template argument.
+void test_cuda_static()
+{
+   using DeviceArray = Containers::Array< int, Devices::Cuda >;
+   using HostArray = Containers::Array< int, Devices::Host >;
+   constexpr int length = 5;
+
+   HostArray expected;
+   expected.setSize( length );
+   expected.setValue( 1 );
+
+   DeviceArray data( length );
+   data.setValue( 0 );
+   auto dataView = data.getView();
+
+   // the view is captured by copy ([=]); mutable makes the copy non-const
+   auto kernel = [=] __cuda_callable__ ( int ) mutable
+   {
+      staticFor< int, 0, length >(
+         [&dataView] ( auto index ) { static_helper< index >( dataView ); }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+
+   // assigning a Cuda array to a Host array brings the result to the host
+   HostArray result;
+   result = data;
+   EXPECT_EQ( result, expected );
+}
+
+TEST( staticForTest, cuda_static ) { test_cuda_static(); }
+#endif
+#endif
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/unrolledForTest.cpp b/src/UnitTests/Algorithms/unrolledForTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81f9042e52f658e91f1a4e0aa678873e9e6257a3
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.cpp
@@ -0,0 +1 @@
+#include "unrolledForTest.h"
diff --git a/src/UnitTests/Algorithms/unrolledForTest.cu b/src/UnitTests/Algorithms/unrolledForTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..81f9042e52f658e91f1a4e0aa678873e9e6257a3
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.cu
@@ -0,0 +1 @@
+#include "unrolledForTest.h"
diff --git a/src/UnitTests/Algorithms/unrolledForTest.h b/src/UnitTests/Algorithms/unrolledForTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..bda0e902418f074107ea104e7bebac3a6b84ca56
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.h
@@ -0,0 +1,124 @@
+/***************************************************************************
+                          unrolledForTest.h  -  description
+                             -------------------
+    begin                : Apr 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <array>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+
+#ifdef HAVE_GTEST
+// Runs unrolledFor over [0, N) on the host, adding 1 to each element of a
+// zero-initialized array, and expects every element to end up equal to 1.
+template< int N >
+void test_host()
+{
+   std::array< int, N > hits{};  // value-initialized: all zeros
+   std::array< int, N > expected;
+   expected.fill( 1 );
+
+   unrolledFor< int, 0, N >(
+      [&hits] ( auto index ) { hits[ index ] += 1; }
+   );
+
+   EXPECT_EQ( hits, expected );
+}
+
+// The check above instantiated for several loop lengths.
+TEST( unrolledForTest, host_size_8 )
+{
+   test_host< 8 >();
+}
+
+TEST( unrolledForTest, host_size_97 )
+{
+   test_host< 97 >();
+}
+
+TEST( unrolledForTest, host_size_5000 )
+{
+   test_host< 5000 >();
+}
+
+// Empty (begin == end) and reversed (begin > end) ranges
+// must never call the functor.
+TEST( unrolledForTest, host_empty )
+{
+   bool invoked = false;
+
+   // begin == end; the index is irrelevant, so the parameter is unnamed
+   unrolledFor< int, 0, 0 >(
+      [&invoked] ( auto ) { invoked = true; }
+   );
+   EXPECT_FALSE( invoked );
+
+   // begin > end
+   unrolledFor< int, 0, -1 >(
+      [&invoked] ( auto ) { invoked = true; }
+   );
+   EXPECT_FALSE( invoked );
+}
+
+#ifdef HAVE_CUDA
+// Runs unrolledFor over [0, N) inside a kernel executed by ParallelFor over
+// [0, 1), adding 1 to each element of a zero-filled device array.
+template< int N >
+void test_cuda()
+{
+   using DeviceArray = Containers::Array< int, Devices::Cuda >;
+   using HostArray = Containers::Array< int, Devices::Host >;
+   HostArray expected;
+   expected.setSize( N );
+   expected.setValue( 1 );
+
+   DeviceArray data( N );
+   data.setValue( 0 );
+   auto dataView = data.getView();
+
+   // the view is captured by copy ([=]); mutable makes the copy non-const
+   auto kernel = [=] __cuda_callable__ ( int ) mutable
+   {
+      unrolledFor< int, 0, N >(
+         [&dataView] ( auto index ) { dataView[ index ] += 1; }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+   HostArray result;
+   result = data;  // assigning a Cuda array to a Host array brings the result to the host
+   EXPECT_EQ( result, expected );
+}
+
+TEST( unrolledForTest, cuda_size_8 )
+{
+   test_cuda< 8 >();
+}
+
+TEST( unrolledForTest, cuda_size_97 )
+{
+   test_cuda< 97 >();
+}
+
+TEST( unrolledForTest, cuda_size_5000 )
+{
+   test_cuda< 5000 >();
+}
+#endif
+#endif
+
+#include "../main.h"
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index a90f09506d083db52e4b45ded4c0a49485d9d7e2..8dc9d6d26c2553039ba34474d4adb6a990cc8bd5 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -151,12 +151,12 @@ TYPED_TEST( DistributedVectorTest, scan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -164,7 +164,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
@@ -172,7 +172,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -181,7 +181,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -189,7 +189,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
@@ -197,12 +197,12 @@ TYPED_TEST( DistributedVectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -270,12 +270,12 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -283,7 +283,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
@@ -291,7 +291,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -300,7 +300,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -308,7 +308,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
@@ -316,12 +316,12 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h
index 7f2151c5ef15429d549020d076e4fd99681c3b8f..3c52e9eeff04af5d4bb26ea04edf2231b63f7546 100644
--- a/src/UnitTests/Containers/VectorPrefixSumTest.h
+++ b/src/UnitTests/Containers/VectorPrefixSumTest.h
@@ -83,12 +83,12 @@ TYPED_TEST( VectorTest, scan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -96,7 +96,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
@@ -104,7 +104,7 @@ TYPED_TEST( VectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -113,7 +113,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -121,7 +121,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
@@ -129,12 +129,12 @@ TYPED_TEST( VectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -206,12 +206,12 @@ TYPED_TEST( VectorTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -219,7 +219,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
@@ -227,7 +227,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -236,7 +236,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -244,7 +244,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
@@ -252,12 +252,12 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Meshes/EntityTests.h b/src/UnitTests/Meshes/EntityTests.h
new file mode 100644
index 0000000000000000000000000000000000000000..109e6d5d99f05090d2e29470f2f0a5d469596813
--- /dev/null
+++ b/src/UnitTests/Meshes/EntityTests.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Algorithms/staticFor.h>
+#include <TNL/Meshes/MeshEntity.h>
+
+namespace EntityTests {
+
+template< typename MeshEntity >
+void testVertex( const MeshEntity& entity )  // generic fallback with an empty body: used when the Vertex-specific overload does not match
+{}
+
+template< typename MeshConfig, typename Device >
+void testVertex( const TNL::Meshes::MeshEntity< MeshConfig, Device, TNL::Meshes::Topologies::Vertex >& entity )  // overload for entities with the Vertex topology
+{
+   EXPECT_EQ( entity.getPoint(), entity.getMesh().getPoint( entity.getIndex() ) );  // the entity's point must equal the point the mesh returns for the entity's index
+}
+
+// Checks that the entity and its mesh report the same subentities of dimension `subdimension`.
+template< int subdimension, typename MeshEntity >
+void testSubentities( const MeshEntity& entity )
+{
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();  // bound by reference: a by-value local would copy the mesh on every call
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+   const auto meshSubentitiesCount = mesh.template getSubentitiesCount< dimension, subdimension >( index );
+   ASSERT_EQ( entity.template getSubentitiesCount< subdimension >(), meshSubentitiesCount );  // fatal: skip the per-index comparison when the counts differ
+   for( int i = 0; i < entity.template getSubentitiesCount< subdimension >(); i++ ) {
+      const auto meshSubentityIndex = mesh.template getSubentityIndex< dimension, subdimension >( index, i );
+      EXPECT_EQ( entity.template getSubentityIndex< subdimension >( i ), meshSubentityIndex );
+   }
+}
+
+// Checks that the entity and its mesh report the same superentities of dimension `superdimension`.
+template< int superdimension, typename MeshEntity >
+void testSuperentities( const MeshEntity& entity )
+{
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();  // bound by reference: a by-value local would copy the mesh on every call
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+   const auto meshSuperentitiesCount = mesh.template getSuperentitiesCount< dimension, superdimension >( index );
+   ASSERT_EQ( entity.template getSuperentitiesCount< superdimension >(), meshSuperentitiesCount );  // fatal: skip the per-index comparison when the counts differ
+   for( int i = 0; i < entity.template getSuperentitiesCount< superdimension >(); i++ ) {
+      const auto meshSuperentityIndex = mesh.template getSuperentityIndex< dimension, superdimension >( index, i );
+      EXPECT_EQ( entity.template getSuperentityIndex< superdimension >( i ), meshSuperentityIndex );
+   }
+}
+
+// Verifies that an entity is consistent with its mesh: each accessor checked here (point, tag,
+// subentity and superentity counts/indices) must return the same value from the entity and the mesh.
+template< typename MeshEntity >
+void testEntity( const MeshEntity& entity )
+{
+   // compile-time requirements on the MeshEntity type (NOTE(review): <type_traits> is not included directly by this header)
+   static_assert( std::is_constructible< MeshEntity, typename MeshEntity::MeshType, typename MeshEntity::GlobalIndexType >::value,
+                  "MeshEntity should be constructible from its MeshType and GlobalIndexType" );
+   static_assert( ! std::is_default_constructible< MeshEntity >::value,
+                  "MeshEntity should not be default-constructible" );
+   static_assert( std::is_copy_constructible< MeshEntity >::value,
+                  "MeshEntity should be copy-constructible" );
+   static_assert( std::is_move_constructible< MeshEntity >::value,
+                  "MeshEntity should be move-constructible" );
+   static_assert( std::is_copy_assignable< MeshEntity >::value,
+                  "MeshEntity should be copy-assignable" );
+   static_assert( std::is_move_assignable< MeshEntity >::value,
+                  "MeshEntity should be move-assignable" );
+   static_assert( std::is_trivially_destructible< MeshEntity >::value,
+                  "MeshEntity should be trivially destructible" );
+
+   // run-time checks; the mesh is bound by reference so that it is not copied for every tested entity
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+
+   testVertex( entity );  // only the Vertex-specific overload performs a check; the generic one is empty
+   EXPECT_EQ( entity.getTag(), mesh.template getEntityTag< dimension >( index ) );
+
+   TNL::Algorithms::staticFor< int, 0, dimension >(  // subentity dimensions, bounded by the entity's own dimension
+      [&entity] ( auto subdimension ) {
+         testSubentities< subdimension >( entity );
+      }
+   );
+   TNL::Algorithms::staticFor< int, dimension + 1, MeshEntity::MeshType::getMeshDimension() + 1 >(  // superentity dimensions, starting above the entity's own
+      [&entity] ( auto superdimension ) {
+         testSuperentities< superdimension >( entity );
+      }
+   );
+}
+
+// Runs testEntity on each entity of the given Dimension, for indices 0 .. getEntitiesCount< Dimension >() - 1.
+template< int Dimension, typename Mesh >
+void testEntities( const Mesh& mesh )
+{
+   using GlobalIndex = typename Mesh::GlobalIndexType;
+   const GlobalIndex count = mesh.template getEntitiesCount< Dimension >();
+   for( GlobalIndex idx = 0; idx < count; ++idx ) {
+      testEntity( mesh.template getEntity< Dimension >( idx ) );
+   }
+}
+
+} // EntityTests
+
+// Entry point: runs EntityTests::testEntities for each dimension visited by staticFor< int, 0, getMeshDimension() >.
+// NOTE(review): if staticFor's end bound is exclusive, entities of the full mesh dimension (cells) are not visited - confirm this is intended.
+template< typename Mesh >
+void testEntities( const Mesh& mesh )
+{
+   constexpr int meshDimension = Mesh::getMeshDimension();
+   const auto checkDimension = [&mesh] ( auto dim ) { EntityTests::testEntities< dim >( mesh ); };
+   TNL::Algorithms::staticFor< int, 0, meshDimension >( checkDimension );
+}
+#endif
diff --git a/src/UnitTests/Meshes/MeshTest.h b/src/UnitTests/Meshes/MeshTest.h
index e90161b9617ae7b279095dd047fbced833406555..788c2172a1c16c8dccbd75f6f2ee561cc3ea3522 100644
--- a/src/UnitTests/Meshes/MeshTest.h
+++ b/src/UnitTests/Meshes/MeshTest.h
@@ -16,6 +16,8 @@
 #include <TNL/Meshes/Topologies/Hexahedron.h>
 #include <TNL/Meshes/MeshBuilder.h>
 
+#include "EntityTests.h"
+
 namespace MeshTest {
 
 using namespace TNL;
@@ -130,6 +132,7 @@ void testFinishedMesh( const Mesh& mesh )
    compareStringRepresentation( mesh, mesh2 );
    testCopyAssignment( mesh );
    testMeshOnCuda( mesh );
+   testEntities( mesh );
 }
 
 TEST( MeshTest, TwoTrianglesTest )