From a79560df728af6dec77bdc0c1c322070de8fb9a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 8 Apr 2021 00:15:23 +0200
Subject: [PATCH 01/13] Moved small examples from src/Examples into
 Documentation/Examples

---
 Documentation/Examples/CMakeLists.txt                        | 3 +++
 {src => Documentation}/Examples/ConfigDescriptionExample.cpp | 0
 {src => Documentation}/Examples/LoggerExample.cpp            | 0
 {src => Documentation}/Examples/MathExample.cpp              | 0
 src/Examples/CMakeLists.txt                                  | 5 -----
 5 files changed, 3 insertions(+), 5 deletions(-)
 rename {src => Documentation}/Examples/ConfigDescriptionExample.cpp (100%)
 rename {src => Documentation}/Examples/LoggerExample.cpp (100%)
 rename {src => Documentation}/Examples/MathExample.cpp (100%)

diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index ca8662ad0..29ba5a5df 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -28,6 +28,9 @@ ADD_EXECUTABLE( ObjectExample_getType ObjectExample_getType.cpp )
 ADD_CUSTOM_COMMAND( COMMAND ObjectExample_getType > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ObjectExample_getType.out OUTPUT ObjectExample_getType.out )
 
 ADD_EXECUTABLE( ParameterContainerExample ParameterContainerExample.cpp )
+ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
+ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
+ADD_EXECUTABLE( MathExample MathExample.cpp )
 
 ADD_EXECUTABLE( ParseObjectTypeExample ParseObjectTypeExample.cpp )
 ADD_CUSTOM_COMMAND( COMMAND ParseObjectTypeExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParseObjectTypeExample.out OUTPUT ParseObjectTypeExample.out )
diff --git a/src/Examples/ConfigDescriptionExample.cpp b/Documentation/Examples/ConfigDescriptionExample.cpp
similarity index 100%
rename from src/Examples/ConfigDescriptionExample.cpp
rename to Documentation/Examples/ConfigDescriptionExample.cpp
diff --git a/src/Examples/LoggerExample.cpp b/Documentation/Examples/LoggerExample.cpp
similarity index 100%
rename from src/Examples/LoggerExample.cpp
rename to Documentation/Examples/LoggerExample.cpp
diff --git a/src/Examples/MathExample.cpp b/Documentation/Examples/MathExample.cpp
similarity index 100%
rename from src/Examples/MathExample.cpp
rename to Documentation/Examples/MathExample.cpp
diff --git a/src/Examples/CMakeLists.txt b/src/Examples/CMakeLists.txt
index 652815874..ee71c4b5e 100644
--- a/src/Examples/CMakeLists.txt
+++ b/src/Examples/CMakeLists.txt
@@ -10,8 +10,3 @@ add_subdirectory( inviscid-flow-vl )
 add_subdirectory( flow )
 add_subdirectory( flow-sw )
 add_subdirectory( flow-vl )
-
-
-ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
-ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
-ADD_EXECUTABLE( MathExample MathExample.cpp )
-- 
GitLab


From 410ed20cfea5dae6924360e8006c60cb87550a48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 4 Apr 2021 23:13:49 +0200
Subject: [PATCH 02/13] Improved ParallelForTest

---
 .../Tutorials/ForLoops/ParallelFor2D-snippet.cpp   |  3 ---
 .../Tutorials/ForLoops/tutorial_ForLoops.md        |  6 +++++-
 src/UnitTests/Algorithms/ParallelForTest.h         | 14 ++++++++------
 3 files changed, 13 insertions(+), 10 deletions(-)
 delete mode 100644 Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp

diff --git a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
deleted file mode 100644
index 40f29313a..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-for( Index j = startY; j < endY; j++ )
-   for( Index i = startX; i < endX; i++ )
-      f( i, j, args... );
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 6b25c6d49..818a9a9a9 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -35,7 +35,11 @@ Performing for-loops in higher dimensions is simillar. In the following example
 
 Notice the parameters of the lambda function `sum`. The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i` to acces the vector elements sequentialy on CPU and to fullfill coalesced memory accesses on GPU. The for-loop is executed by calling `ParallelFor2D` with proper device. The first four parameters are `startX, startY, endX, endY` and on CPU this is equivalent to the following embeded for loops:
 
-\include ParallelFor2D-snippet.cpp
+```cpp
+for( Index j = startY; j < endY; j++ )
+   for( Index i = startX; i < endX; i++ )
+      f( i, j, args... );
+```
 
 where `args...` stand for additional arguments passed to the for-loop. After the parameters defining the loops bounds, lambda function (`sum` in this case) is passed followed by additional arguments. One of them, in our example, is `xSize` again because it must be passed to the lambda function for the index mapping computation.
 
diff --git a/src/UnitTests/Algorithms/ParallelForTest.h b/src/UnitTests/Algorithms/ParallelForTest.h
index aa75fd560..fe07247d2 100644
--- a/src/UnitTests/Algorithms/ParallelForTest.h
+++ b/src/UnitTests/Algorithms/ParallelForTest.h
@@ -8,6 +8,8 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#pragma once
+
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Array.h>
@@ -164,7 +166,7 @@ void test_1D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
@@ -200,7 +202,7 @@ void test_2D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -213,7 +215,7 @@ void test_2D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
@@ -249,7 +251,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -262,7 +264,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
 
       a.setValue( 0 );
@@ -275,7 +277,7 @@ void test_3D_cuda()
       ah = a;
       if( ah != expected ) {
          for (int i = 0; i < size; i++)
-            ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i;
+            ASSERT_EQ( ah[i], expected[i] ) << "First index at which the result is wrong is i = " << i;
       }
    }
 }
-- 
GitLab


From 142c2e5c39c355e726fdb236e333dc948a82cdcb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 8 Apr 2021 20:05:28 +0200
Subject: [PATCH 03/13] Improved documentation for ParallelFor

---
 .../Algorithms/ParallelForExample-2D.cpp      | 12 +--
 .../Algorithms/ParallelForExample-3D.cpp      | 14 ++--
 .../Algorithms/ParallelForExample.cpp         | 12 +--
 .../Tutorials/ForLoops/CMakeLists.txt         | 18 +---
 .../ForLoops/ParallelForExample-2D.cu         |  1 -
 .../ForLoops/ParallelForExample-2D_ug.cpp     | 61 --------------
 .../ForLoops/ParallelForExample-3D.cu         |  1 -
 .../ForLoops/ParallelForExample-3D_ug.cpp     | 62 --------------
 .../Tutorials/ForLoops/ParallelForExample.cu  |  1 -
 .../ForLoops/ParallelForExample_ug.cpp        | 58 -------------
 .../Tutorials/ForLoops/tutorial_ForLoops.md   | 40 ++++++---
 src/TNL/Algorithms/ParallelFor.h              | 84 +++++++++++--------
 12 files changed, 97 insertions(+), 267 deletions(-)
 delete mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
 delete mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp
 delete mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
 delete mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp
 delete mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample.cu
 delete mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp

diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
index aafff2466..ecc53948c 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
@@ -13,10 +12,12 @@ void initMeshFunction( const int xSize,
                        Vector< double, Device >& v,
                        const double& c )
 {
-   auto view = v1.getConstView();
-   auto init = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
-      view[ j * xSize + i ] =  c; };
-   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j ) mutable
+   {
+      view[ j * xSize + i ] = c;
+   };
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init );
 }
 
 int main( int argc, char* argv[] )
@@ -42,4 +43,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
index 3cb9b5b64..8eb5ff315 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
@@ -14,16 +13,18 @@ void initMeshFunction( const int xSize,
                        Vector< double, Device >& v,
                        const double& c )
 {
-   auto view = v1.getConstView();
-   auto init = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
-      view[ ( k * ySize + j ) * xSize + i ] =  c; };
-   ParallelFor3D< Device >::exec( 0, 0, xSize, ySize, init, xSize, ySize, c );
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j, int k ) mutable
+   {
+      view[ ( k * ySize + j ) * xSize + i ] = c;
+   };
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, init );
 }
 
 int main( int argc, char* argv[] )
 {
    /***
-    * Define dimensions of 2D mesh function.
+    * Define dimensions of a 3D mesh function.
     */
    const int xSize( 10 ), ySize( 10 ), zSize( 10 );
    const int size = xSize * ySize * zSize;
@@ -43,4 +44,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
index 9c056fa1d..dd818856b 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -1,10 +1,10 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Algorithms;
 
 /****
  * Set all elements of the vector v to the constant c.
@@ -14,10 +14,11 @@ void initVector( Vector< double, Device >& v,
                  const double& c )
 {
    auto view = v.getView();
-   auto init = [=] __cuda_callable__  ( int i, const double c ) mutable {
-      view[ i ] = c; };
-
-   Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+   auto init = [=] __cuda_callable__ ( int i ) mutable
+   {
+      view[ i ] = c;
+   };
+   ParallelFor< Device >::exec( 0, v.getSize(), init );
 }
 
 int main( int argc, char* argv[] )
@@ -39,4 +40,3 @@ int main( int argc, char* argv[] )
 #endif
    return EXIT_SUCCESS;
 }
-
diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
index 74d8d1b0f..2bbfe30ad 100644
--- a/Documentation/Tutorials/ForLoops/CMakeLists.txt
+++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt
@@ -1,22 +1,10 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
-   CUDA_ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cu )
-   CUDA_ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cu )
-ELSE()
-   ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D_ug.cpp )
-   ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D_ug.cpp )
-ENDIF()
-
 ADD_EXECUTABLE( StaticForExample StaticForExample_ug.cpp )
 ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out )
 
 ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample_ug.cpp )
 ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out )
 
-IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS
-   ParallelForExample.out
+ADD_CUSTOM_TARGET( ForLoops ALL DEPENDS
    StaticForExample.out
-   TemplateStaticForExample.out )
-ENDIF()
+   TemplateStaticForExample.out
+)
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
deleted file mode 120000
index 4a443ad3b..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample-2D_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp
deleted file mode 100644
index 388c326ec..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-2D_ug.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void meshFunctionSum( const int xSize,
-                      const int ySize,
-                      const Vector< double, Device >& v1,
-                      const Vector< double, Device >& v2,
-                      const double& c,
-                      Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
-      const int idx = j * xSize + i;
-      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
-
-   Algorithms::ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Define dimensions of 2D mesh function.
-    */
-   const int xSize( 10 ), ySize( 10 );
-   const int size = xSize * ySize;
-
-   /***
-    * Firstly, test the mesh functions sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
-   host_v1 = 1.0;
-   host_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, host_v1, host_v2, 2.0, host_result );
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
-   cuda_v1 = 1.0;
-   cuda_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, cuda_v1, cuda_v2, 2.0, cuda_result );
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
deleted file mode 120000
index 79ef7851f..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample-3D_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp
deleted file mode 100644
index 37e07c75e..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample-3D_ug.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void meshFunctionSum( const int xSize,
-                      const int ySize,
-                      const int zSize,
-                      const Vector< double, Device >& v1,
-                      const Vector< double, Device >& v2,
-                      const double& c,
-                      Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
-      const int idx = ( k * ySize + j ) * xSize + i;
-      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
-
-   Algorithms::ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize,zSize, sum, xSize, ySize, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Define dimensions of 3D mesh function.
-    */
-   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
-   const int size = xSize * ySize * xSize;
-
-   /***
-    * Firstly, test the mesh functions sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
-   host_v1 = 1.0;
-   host_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, zSize, host_v1, host_v2, 2.0, host_result );
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
-   cuda_v1 = 1.0;
-   cuda_v2 = 2.0;
-   meshFunctionSum( xSize, ySize, zSize, cuda_v1, cuda_v2, 2.0, cuda_result );
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cu b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
deleted file mode 120000
index 79d405285..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ParallelForExample_ug.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp
deleted file mode 100644
index cf91d69ed..000000000
--- a/Documentation/Tutorials/ForLoops/ParallelForExample_ug.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/ParallelFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-template< typename Device >
-void vectorSum( const Vector< double, Device >& v1,
-                const Vector< double, Device >& v2,
-                const double& c,
-                Vector< double, Device >& result )
-{
-   /****
-    * Get vectors view which can be captured by lambda.
-    */
-   auto v1_view = v1.getConstView();
-   auto v2_view = v2.getConstView();
-   auto result_view = result.getView();
-
-   /****
-    * The sum function.
-    */
-   auto sum = [=] __cuda_callable__  ( int i, const double c ) mutable {
-      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
-
-   Algorithms::ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the vectors sum on CPU.
-    */
-   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
-   host_v1 = 1.0;
-   host_v2.forAllElements( []__cuda_callable__ ( int i, double& value ) { value = i; } );
-   vectorSum( host_v1, host_v2, 2.0, host_result );
-   std::cout << "host_v1 = " << host_v1 << std::endl;
-   std::cout << "host_v2 = " << host_v2 << std::endl;
-   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
-   cuda_v1 = 1.0;
-   cuda_v2.forAllElements( []__cuda_callable__ ( int i, double& value ) { value = i; } );
-   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
-   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
-   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
-   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 818a9a9a9..361672397 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -4,24 +4,32 @@
 
 ## Introduction
 
-This tutorial shows how to use different kind of for loops implemented in TNL. Namely, they are:
+This tutorial shows how to use the different kinds of for-loops implemented in TNL. Namely, they are:
 
-* **Parallel for** is a for loop which can be run in parallel, i.e. all iterations of the loop must be independent. Paralle for can run on both multicore CPUs and GPUs.
-* **n-dimensional Parallel For** is extension of common parallel for into more dimensions.
+* **Parallel for** is a for-loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel for can be run on both multicore CPUs and GPUs.
+* **n-dimensional parallel for** is an extension of common parallel for into higher dimensions.
 * **Static For** is a for loop which is performed sequentialy and it is explicitly unrolled by C++ templates. Number of iterations must be static (known at compile time).
 * **Templated Static For** ....
 
 ## Parallel For
 
-Basic parallel for construction in TNL serves for hardware platform transparent expression of parallel for loops. The hardware platform is expressed by a template parameter. The parallel for is defined as:
+The basic _parallel for_ construct in TNL allows parallel for-loops to be expressed independently of the hardware platform.
+The hardware platform is specified by a template parameter.
+The loop is implemented as \ref TNL::Algorithms::ParallelFor and can be used as:
 
 ```
 ParallelFor< Device >::exec( start, end, function, arguments... );
 ```
 
-The `Device` can be either `Devices::Host` or `Devices::Cuda`. The first two parameters define the loop bounds in the C style. It means that there will be iterations for indexes `start` ... `end-1`. Function is a lambda function to be performed in each iteration. It is supposed to receive the iteration index and arguments passed to the parallel for (the last arguments). See the following example:
+The `Device` can be either \ref TNL::Devices::Host or \ref TNL::Devices::Cuda.
+The first two parameters define the loop bounds in the C style.
+It means that there will be iterations for indices `start`, `start+1`, ..., `end-1`.
+The `function` is a lambda function to be called in each iteration.
+It receives the iteration index followed by the additional arguments passed to the _parallel for_ (the trailing `arguments...`).
 
-\include ParallelForExample_ug.cpp
+See the following example:
+
+\include ParallelForExample.cpp
 
 The result is:
 
@@ -29,11 +37,17 @@ The result is:
 
 ## n-dimensional Parallel For
 
-Performing for-loops in higher dimensions is simillar. In the following example we build 2D mesh function on top of TNL vector. Two dimensional indexes `( i, j )` are mapped to vector index `idx` as `idx = j * xSize + i`, where the mesh fuction has dimensions `xSize * ySize`. Of course, in this simple example, it does not make any sense to compute a sum of two mesh function this way, it is only an example.
+For-loops in higher dimensions can be performed similarly via \ref TNL::Algorithms::ParallelFor2D and \ref TNL::Algorithms::ParallelFor3D.
+In the following example we build a 2D mesh function on top of \ref TNL::Containers::Vector.
+Two dimensional indices `( i, j )` are mapped to the vector index `idx` as `idx = j * xSize + i`, where the mesh function has dimensions `xSize * ySize`.
+The following simple example performs initialization of the mesh function with a constant value `c = 1.0`:
 
-\include ParallelForExample-2D_ug.cpp
+\include ParallelForExample-2D.cpp
 
-Notice the parameters of the lambda function `sum`. The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i` to acces the vector elements sequentialy on CPU and to fullfill coalesced memory accesses on GPU. The for-loop is executed by calling `ParallelFor2D` with proper device. The first four parameters are `startX, startY, endX, endY` and on CPU this is equivalent to the following embeded for loops:
+Notice the parameters of the lambda function `init`.
+The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i` to access the vector elements sequentially on CPU and to fulfill coalesced memory accesses on GPU.
+The for-loop is executed by calling `ParallelFor2D` with the proper device.
+The first four parameters are `startX, startY, endX, endY` and on CPU this is equivalent to the following embedded for-loops:
 
 ```cpp
 for( Index j = startY; j < endY; j++ )
@@ -41,11 +55,13 @@ for( Index j = startY; j < endY; j++ )
       f( i, j, args... );
 ```
 
-where `args...` stand for additional arguments passed to the for-loop. After the parameters defining the loops bounds, lambda function (`sum` in this case) is passed followed by additional arguments. One of them, in our example, is `xSize` again because it must be passed to the lambda function for the index mapping computation.
+where `args...` stand for additional arguments passed to the for-loop.
+After the parameters defining the loop bounds, the lambda function (`init` in this case) is passed, followed by additional arguments that are forwarded to the lambda function after the iteration indices.
+In the example above there are no additional arguments, since the lambda function `init` captures all variables it needs to work with.
 
-For the completness, we show modification of the previous example into 3D:
+For completeness, we show modification of the previous example into 3D:
 
-\include ParallelForExample-3D_ug.cpp
+\include ParallelForExample-3D.cpp
 
 ## Static For
 
diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index cb096f879..97cf13c77 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -53,10 +53,11 @@ enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
 
 /**
- * \brief Parallel for loop for one dimensional interval of indexes.
+ * \brief Parallel for loop for one dimensional interval of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -64,16 +65,17 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param start the for-loop iterates over index interval [start, end).
-    * \param end the for-loop iterates over index interval [start, end).
-    * \param f is the function to be called in each iteration
+    * \param start is the left bound of the iteration range `[start, end)`.
+    * \param end is the right bound of the iteration range `[start, end)`.
+    * \param f is the function to be called in each iteration.
     * \param args are additional parameters to be passed to the function f.
     *
     * \par Example
@@ -93,10 +95,11 @@ struct ParallelFor
 };
 
 /**
- * \brief Parallel for loop for two dimensional domain of indexes.
+ * \brief Parallel for loop for two dimensional domain of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -104,23 +107,26 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor2D
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY).
-    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param startX the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param startY the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param endX the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
+    * \param endY the for-loop iterates over index domain `[startX,endX) x [startY,endY)`.
     * \param f is the function to be called in each iteration
     * \param args are additional parameters to be passed to the function f.
     *
     * The function f is called for each iteration as
     *
+    * \code
     * f( i, j, args... )
+    * \endcode
     *
     * where the first parameter is changing more often than the second one.
     *
@@ -142,10 +148,11 @@ struct ParallelFor2D
 };
 
 /**
- * \brief Parallel for loop for three dimensional domain of indexes.
+ * \brief Parallel for loop for three dimensional domain of indices.
  *
- * \tparam Device says on what device the for-loop is gonna be executed.
- *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Device specifies the device where the for-loop will be executed.
+ *    It can be \ref TNL::Devices::Host, \ref TNL::Devices::Cuda or
+ *    \ref TNL::Devices::Sequential.
  * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
  */
 template< typename Device = Devices::Sequential,
@@ -153,25 +160,28 @@ template< typename Device = Devices::Sequential,
 struct ParallelFor3D
 {
    /**
-    * \brief Static method for execution of the loop.
+    * \brief Static method for the execution of the loop.
     *
-    * \tparam Index defines the type of indexes over which the loop iterates.
-    * \tparam Function is the type of function to be called in each iteration.
-    * \tparam FunctionArgs is a variadic type of additional parameters which are
-    *    supposed to be passed to the inner Function.
+    * \tparam Index is the type of the loop indices.
+    * \tparam Function is the type of the functor to be called in each iteration
+    *    (it is usually deduced from the argument used in the function call).
+    * \tparam FunctionArgs is a variadic pack of types for additional parameters
+    *    that are forwarded to the functor in every iteration.
     *
-    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
-    * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startX the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param startY the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param startZ the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endX the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endY the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
+    * \param endZ the for-loop iterates over index domain `[startX,endX) x [startY,endY) x [startZ,endZ)`.
     * \param f is the function to be called in each iteration
     * \param args are additional parameters to be passed to the function f.
     *
     * The function f is called for each iteration as
     *
+    * \code
     * f( i, j, k, args... )
+    * \endcode
     *
     * where the first parameter is changing the most often.
     *
-- 
GitLab


From ad7ddcf3103232c8121c8bc25697a58e658e376b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 7 Apr 2021 22:35:48 +0200
Subject: [PATCH 04/13] Refactoring: renamed StaticFor to UnrolledFor

The main purpose of this algorithm is to do loop unrolling, not to
handle static/constant bounds. The true StaticFor will handle the
iteration indices in such a way that they can be used in constant
expressions (e.g. passed as template arguments).
---
 .../Examples/Algorithms/CMakeLists.txt        | 12 +++---
 ...cForExample.cpp => UnrolledForExample.cpp} | 11 ++++--
 .../Tutorials/ForLoops/CMakeLists.txt         |  4 --
 .../Tutorials/ForLoops/StaticForExample-2.cpp |  4 --
 .../Tutorials/ForLoops/StaticForExample-3.cpp |  1 -
 .../ForLoops/StaticForExample_ug.cpp          | 28 -------------
 .../Tutorials/ForLoops/tutorial_ForLoops.md   | 39 +++++++++++++------
 .../Algorithms/{StaticFor.h => UnrolledFor.h} | 30 +++++++-------
 src/TNL/Containers/StaticArray.hpp            | 14 +++----
 src/TNL/Containers/StaticVector.hpp           |  2 +-
 .../Containers/detail/StaticArrayAssignment.h |  6 +--
 11 files changed, 67 insertions(+), 84 deletions(-)
 rename Documentation/Examples/Algorithms/{StaticForExample.cpp => UnrolledForExample.cpp} (69%)
 delete mode 100644 Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
 delete mode 100644 Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
 delete mode 100644 Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp
 rename src/TNL/Algorithms/{StaticFor.h => UnrolledFor.h} (69%)

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index d0d1eda9b..82d0b3a91 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -6,12 +6,10 @@ ELSE()
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
 ENDIF()
 
-IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS
-   ParallelForExample.out
- )
-ELSE()
+ADD_EXECUTABLE(UnrolledForExample UnrolledForExample.cpp)
+ADD_CUSTOM_COMMAND( COMMAND UnrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UnrolledForExample.out OUTPUT UnrolledForExample.out )
+
 ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    ParallelForExample.out
- )
-ENDIF()
\ No newline at end of file
+   UnrolledForExample.out
+)
diff --git a/Documentation/Examples/Algorithms/StaticForExample.cpp b/Documentation/Examples/Algorithms/UnrolledForExample.cpp
similarity index 69%
rename from Documentation/Examples/Algorithms/StaticForExample.cpp
rename to Documentation/Examples/Algorithms/UnrolledForExample.cpp
index 47757458d..05e438cfd 100644
--- a/Documentation/Examples/Algorithms/StaticForExample.cpp
+++ b/Documentation/Examples/Algorithms/UnrolledForExample.cpp
@@ -1,7 +1,6 @@
 #include <iostream>
-#include <cstdlib>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/UnrolledFor.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
@@ -20,8 +19,12 @@ int main( int argc, char* argv[] )
    /****
     * Compute an addition of a vector and a constant number.
     */
-   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
-   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   auto addition = [&]( int i, const double& c )
+   {
+      a[ i ] = b[ i ] + c;
+      sum += a[ i ];
+   };
+   Algorithms::UnrolledFor< 0, Size >::exec( addition, 3.14 );
    std::cout << "a = " << a << std::endl;
    std::cout << "sum = " << sum << std::endl;
 }
diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
index 2bbfe30ad..6eaa5dd07 100644
--- a/Documentation/Tutorials/ForLoops/CMakeLists.txt
+++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt
@@ -1,10 +1,6 @@
-ADD_EXECUTABLE( StaticForExample StaticForExample_ug.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out )
-
 ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample_ug.cpp )
 ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out )
 
 ADD_CUSTOM_TARGET( ForLoops ALL DEPENDS
-   StaticForExample.out
    TemplateStaticForExample.out
 )
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
deleted file mode 100644
index 7ee4afd72..000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-for( int i = 0; i < Size; i++ )
-{
-   a[ i ] = b[ i ] + c; sum += a[ i ];
-};
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
deleted file mode 100644
index 5298b00a1..000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
+++ /dev/null
@@ -1 +0,0 @@
-Algorithms::StaticFor< 0, Size, true >::exec( addition, 3.14 );
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp b/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp
deleted file mode 100644
index 47757458d..000000000
--- a/Documentation/Tutorials/ForLoops/StaticForExample_ug.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/StaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Create two static vectors
-    */
-   const int Size( 3 );
-   StaticVector< Size, double > a, b;
-   a = 1.0;
-   b = 2.0;
-   double sum( 0.0 );
-
-   /****
-    * Compute an addition of a vector and a constant number.
-    */
-   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
-   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
-   std::cout << "a = " << a << std::endl;
-   std::cout << "sum = " << sum << std::endl;
-}
-
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 361672397..135870435 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -8,7 +8,7 @@ This tutorial shows how to use different kind of for-loops implemented in TNL. N
 
 * **Parallel for** is a for-loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel for can be run on both multicore CPUs and GPUs.
 * **n-dimensional parallel for** is an extension of common parallel for into higher dimensions.
-* **Static For** is a for loop which is performed sequentialy and it is explicitly unrolled by C++ templates. Number of iterations must be static (known at compile time).
+* **Unrolled for** is a for-loop which is performed sequentially and it is explicitly unrolled by C++ templates. Iteration bounds must be static (known at compile time).
 * **Templated Static For** ....
 
 ## Parallel For
@@ -63,27 +63,44 @@ For completeness, we show modification of the previous example into 3D:
 
 \include ParallelForExample-3D.cpp
 
-## Static For
+## Unrolled For
 
-Static for-loop is designed for short loops with constant (i.e. known at the compile time) number of iterations. It is often used with static arrays and vectors. An adventage of this kind of for loop is that it is explicitly unrolled when the loop is short (up to eight iterations). See the following example:
+\ref TNL::Algorithms::UnrolledFor is a for-loop that is explicitly unrolled via C++ templates when the loop is short (up to eight iterations).
+The bounds of `UnrolledFor` loops must be constant (i.e. known at the compile time).
+It is often used with static arrays and vectors.
 
-\include StaticForExample_ug.cpp
+See the following example:
+
+\include UnrolledForExample.cpp
 
-Notice that the static for-loop works with a lambda function simillar to parallel for-loop. The bounds of the loop are passed as template parameters in the statement `Algorithms::StaticFor< 0, Size >`. The parameters of the static method `exec` are the lambda functions to be performed in each iteration and auxiliar data to be passed to the function. The function gets the loop index `i` first followed by the auxiliary data `sum` in this example.
+Notice that the unrolled for-loop works with a lambda function similar to parallel for-loop.
+The bounds of the loop are passed as template parameters in the statement `Algorithms::UnrolledFor< 0, Size >`.
+The parameters of the static method `exec` are the lambda function to be performed in each iteration and auxiliary data to be passed to the function.
+The function gets the loop index `i` first, followed by the auxiliary data (the constant `c` in this example).
 
 The result looks as:
 
-\include StaticForExample.out
+\include UnrolledForExample.out
 
-The effect of `StaticFor` is really the same as usual for-loop. The following code does the same as the previous example:
+The effect of `UnrolledFor` is really the same as that of a usual for-loop.
+The following code does the same as the previous example:
 
-\include StaticForExample-2.cpp
+```cpp
+for( int i = 0; i < Size; i++ )
+{
+   a[ i ] = b[ i ] + 3.14;
+   sum += a[ i ];
+};
+```
 
-The benefit of `StaticFor` is mainly in the explicit unrolling of short loops which can improve the performance in some situations. `StaticFor` can be forced to do the loop-unrolling in any situations using the third template parameter as follows:
+The benefit of `UnrolledFor` is mainly in the explicit unrolling of short loops which can improve performance in some situations.
+`UnrolledFor` can be forced to do the loop-unrolling in any situation using the third template parameter as follows:
 
-\include StaticForExample-3.cpp
+```cpp
+Algorithms::UnrolledFor< 0, Size, true >::exec( addition, 3.14 );
+```
 
-`StaticFor` can be used also in CUDA kernels.
+`UnrolledFor` can be used also in CUDA kernels.
 
 ## Templated Static For
 
diff --git a/src/TNL/Algorithms/StaticFor.h b/src/TNL/Algorithms/UnrolledFor.h
similarity index 69%
rename from src/TNL/Algorithms/StaticFor.h
rename to src/TNL/Algorithms/UnrolledFor.h
index 6a450638f..710f2a0d3 100644
--- a/src/TNL/Algorithms/StaticFor.h
+++ b/src/TNL/Algorithms/UnrolledFor.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          StaticFor.h  -  description
+                          UnrolledFor.h  -  description
                              -------------------
     begin                : Jul 16, 2019
     copyright            : (C) 2019 by Tomas Oberhuber
@@ -10,16 +10,18 @@
 
 #pragma once
 
+#include <utility>
+
 #include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace Algorithms {
 
 /**
- * \brief StaticFor is a wrapper for common for-loop with explicit unrolling.
+ * \brief UnrolledFor is a wrapper for common for-loop with explicit unrolling.
  *
- * StaticFor can be used only for for-loops bounds of which are known at the
- * compile time. StaticFor performs explicit loop unrolling for better performance.
+ * UnrolledFor can be used only for for-loops whose bounds are known at
+ * compile time. UnrolledFor performs explicit loop unrolling for better performance.
  * This, however, does not make sense for loops with a large iterations
  * count. For a very large iterations count it could trigger the compiler's
  * limit on recursive template instantiation. Also note that the compiler
@@ -33,20 +35,20 @@ namespace Algorithms {
  *   unrolling is performed.
  *
  * \par Example
- * \include Algorithms/StaticForExample.cpp
+ * \include Algorithms/UnrolledForExample.cpp
  * \par Output
- * \include StaticForExample.out
+ * \include UnrolledForExample.out
  */
 template< int Begin, int End, bool unrolled = (End - Begin <= 8) >
-struct StaticFor;
+struct UnrolledFor;
 
 template< int Begin, int End >
-struct StaticFor< Begin, End, true >
+struct UnrolledFor< Begin, End, true >
 {
-   static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." );
+   static_assert( Begin < End, "Wrong index interval for UnrolledFor. Begin must be less than end." );
 
    /**
-    * \brief Static method for execution od the StaticFor.
+    * \brief Static method for the execution of the UnrolledFor.
     *
     * \param f is a (lambda) function to be performed in each iteration.
     * \param args are auxiliary data to be passed to the function f.
@@ -56,12 +58,12 @@ struct StaticFor< Begin, End, true >
    static void exec( const Function& f, Args&&... args )
    {
       f( Begin, args... );
-      StaticFor< Begin + 1, End >::exec( f, std::forward< Args >( args )... );
+      UnrolledFor< Begin + 1, End >::exec( f, std::forward< Args >( args )... );
    }
 };
 
 template< int End >
-struct StaticFor< End, End, true >
+struct UnrolledFor< End, End, true >
 {
    template< typename Function, typename... Args >
    __cuda_callable__
@@ -69,9 +71,9 @@ struct StaticFor< End, End, true >
 };
 
 template< int Begin, int End >
-struct StaticFor< Begin, End, false >
+struct UnrolledFor< Begin, End, false >
 {
-   static_assert( Begin <= End, "Wrong index interval for StaticFor. Begin must be less than or equal to end." );
+   static_assert( Begin <= End, "Wrong index interval for UnrolledFor. Begin must be less than or equal to end." );
 
    template< typename Function, typename... Args >
    __cuda_callable__
diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index c6af2e4ed..87e3eea9d 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Math.h>
 #include <TNL/Containers/StaticArray.h>
 #include <TNL/Containers/detail/StaticArrayAssignment.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/UnrolledFor.h>
 
 namespace TNL {
 namespace Containers {
@@ -102,21 +102,21 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value v[ Size ] )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value& v )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), v );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), v );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const StaticArray< Size, Value >& v )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v.getData() );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v.getData() );
 }
 
 template< int Size, typename Value >
@@ -228,7 +228,7 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >& StaticArray< Size, Value >::operator=( const StaticArray< Size, Value >& array )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), array.getData() );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), array.getData() );
    return *this;
 }
 
@@ -264,7 +264,7 @@ StaticArray< Size, Value >::
 operator StaticArray< Size, OtherValue >() const
 {
    StaticArray< Size, OtherValue > aux;
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), getData() );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), getData() );
    return aux;
 }
 
@@ -272,7 +272,7 @@ template< int Size, typename Value >
 __cuda_callable__
 void StaticArray< Size, Value >::setValue( const ValueType& val )
 {
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), val );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), val );
 }
 
 template< int Size, typename Value >
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index bbcf8a09f..d021cd78d 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -99,7 +99,7 @@ StaticVector< Size, Real >::
 operator StaticVector< Size, OtherReal >() const
 {
    StaticVector< Size, OtherReal > aux;
-   Algorithms::StaticFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), this->getData() );
+   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), this->getData() );
    return aux;
 }
 
diff --git a/src/TNL/Containers/detail/StaticArrayAssignment.h b/src/TNL/Containers/detail/StaticArrayAssignment.h
index 6ba6c8e02..eae5f474b 100644
--- a/src/TNL/Containers/detail/StaticArrayAssignment.h
+++ b/src/TNL/Containers/detail/StaticArrayAssignment.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/TypeTraits.h>
-#include <TNL/Algorithms/StaticFor.h>
+#include <TNL/Algorithms/UnrolledFor.h>
 
 namespace TNL {
 namespace Containers {
@@ -53,7 +53,7 @@ struct StaticArrayAssignment< StaticArray, T, true >
    static void assign( StaticArray& a, const T& v )
    {
       static_assert( StaticArray::getSize() == T::getSize(), "Cannot assign static arrays with different size." );
-      Algorithms::StaticFor< 0, StaticArray::getSize() >::exec( AssignArrayFunctor{}, a.getData(), v.getData() );
+      Algorithms::UnrolledFor< 0, StaticArray::getSize() >::exec( AssignArrayFunctor{}, a.getData(), v.getData() );
    }
 };
 
@@ -68,7 +68,7 @@ struct StaticArrayAssignment< StaticArray, T, false >
    __cuda_callable__
    static void assign( StaticArray& a, const T& v )
    {
-      Algorithms::StaticFor< 0, StaticArray::getSize() >::exec( AssignValueFunctor{}, a.getData(), v );
+      Algorithms::UnrolledFor< 0, StaticArray::getSize() >::exec( AssignValueFunctor{}, a.getData(), v );
    }
 };
 
-- 
GitLab


From 4f8d1e21a7a60cb02dc5974b6ab6d99672049a79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 3 Apr 2021 23:48:26 +0200
Subject: [PATCH 05/13] Refactoring: reimplemented TemplateStaticFor with
 constexpr functions and generic lambdas

- interface renamed from `TemplateStaticFor` to `staticFor`
- also added tests and updated documentation, read it for details
---
 .../Examples/Algorithms/CMakeLists.txt        |   4 +
 .../Algorithms/TemplateStaticForExample.cpp   |  31 ----
 .../Examples/Algorithms/staticForExample.cpp  |  16 ++
 Documentation/Tutorials/CMakeLists.txt        |   1 -
 .../Tutorials/ForLoops/CMakeLists.txt         |   6 -
 .../ForLoops/TemplateStaticForExample_ug.cpp  |  32 ----
 .../Tutorials/ForLoops/tutorial_ForLoops.md   |  33 ++--
 src/TNL/Algorithms/TemplateStaticFor.h        | 130 --------------
 src/TNL/Algorithms/staticFor.h                | 112 +++++++++++++
 src/UnitTests/Algorithms/CMakeLists.txt       |   1 +
 src/UnitTests/Algorithms/staticForTest.cpp    |   1 +
 src/UnitTests/Algorithms/staticForTest.cu     |   1 +
 src/UnitTests/Algorithms/staticForTest.h      | 158 ++++++++++++++++++
 13 files changed, 308 insertions(+), 218 deletions(-)
 delete mode 100644 Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
 create mode 100644 Documentation/Examples/Algorithms/staticForExample.cpp
 delete mode 100644 Documentation/Tutorials/ForLoops/CMakeLists.txt
 delete mode 100644 Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp
 delete mode 100644 src/TNL/Algorithms/TemplateStaticFor.h
 create mode 100644 src/TNL/Algorithms/staticFor.h
 create mode 100644 src/UnitTests/Algorithms/staticForTest.cpp
 create mode 100644 src/UnitTests/Algorithms/staticForTest.cu
 create mode 100644 src/UnitTests/Algorithms/staticForTest.h

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 82d0b3a91..87b544683 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -6,10 +6,14 @@ ELSE()
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
 ENDIF()
 
+ADD_EXECUTABLE(staticForExample staticForExample.cpp)
+ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out )
+
 ADD_EXECUTABLE(UnrolledForExample UnrolledForExample.cpp)
 ADD_CUSTOM_COMMAND( COMMAND UnrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UnrolledForExample.out OUTPUT UnrolledForExample.out )
 
 ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    ParallelForExample.out
    UnrolledForExample.out
+   staticForExample.out
 )
diff --git a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
deleted file mode 100644
index a2fce79ae..000000000
--- a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-const int Size( 5 );
-
-template< int I >
-struct LoopBody
-{
-   static void exec( const StaticVector< Size, double >& v ) {
-      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
-   }
-};
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Initiate static vector
-    */
-   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
-
-   /****
-    * Print out the vector using template parameters for indexing.
-    */
-   Algorithms::TemplateStaticFor< 0, Size, LoopBody >::exec( v );
-}
-
diff --git a/Documentation/Examples/Algorithms/staticForExample.cpp b/Documentation/Examples/Algorithms/staticForExample.cpp
new file mode 100644
index 000000000..c3c4a68f4
--- /dev/null
+++ b/Documentation/Examples/Algorithms/staticForExample.cpp
@@ -0,0 +1,16 @@
+#include <iostream>
+#include <array>
+#include <TNL/Algorithms/staticFor.h>
+
+int main( int argc, char* argv[] )
+{
+   // initialize a std::array with five elements
+   std::array< int, 5 > a{ 1, 2, 3, 4, 5 };
+
+   // print the array; the loop index `i` is passed to `std::get` as a template argument
+   TNL::Algorithms::staticFor< int, 0, 5 >(
+      [&a] ( auto i ) {
+         std::cout << "a[ " << i << " ] = " << std::get< i >( a ) << std::endl;
+      }
+   );
+}
diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt
index 5511d0633..05ed1f33c 100644
--- a/Documentation/Tutorials/CMakeLists.txt
+++ b/Documentation/Tutorials/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory( GeneralConcepts )
 add_subdirectory( Arrays )
 add_subdirectory( Vectors )
 add_subdirectory( ReductionAndScan )
-add_subdirectory( ForLoops )
 add_subdirectory( Pointers )
 add_subdirectory( Matrices )
 add_subdirectory( Meshes )
diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
deleted file mode 100644
index 6eaa5dd07..000000000
--- a/Documentation/Tutorials/ForLoops/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample_ug.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out )
-
-ADD_CUSTOM_TARGET( ForLoops ALL DEPENDS
-   TemplateStaticForExample.out
-)
diff --git a/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp b/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp
deleted file mode 100644
index eb65fd6cc..000000000
--- a/Documentation/Tutorials/ForLoops/TemplateStaticForExample_ug.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-
-using Index = int;
-const Index Size( 5 );
-
-template< Index I >
-struct LoopBody
-{
-   static void exec( const StaticVector< Size, double >& v ) {
-      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
-   }
-};
-
-int main( int argc, char* argv[] )
-{
-   /****
-    * Initiate static vector
-    */
-   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
-
-   /****
-    * Print out the vector using template parameters for indexing.
-    */
-   Algorithms::TemplateStaticFor< Index, 0, Size, LoopBody >::exec( v );
-}
-
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 135870435..9e1663102 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -9,7 +9,7 @@ This tutorial shows how to use different kind of for-loops implemented in TNL. N
 * **Parallel for** is a for-loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel for can be run on both multicore CPUs and GPUs.
 * **n-dimensional parallel for** is an extension of common parallel for into higher dimensions.
 * **Unrolled for** is a for-loop which is performed sequentially and it is explicitly unrolled by C++ templates. Iteration bounds must be static (known at compile time).
-* **Templated Static For** ....
+* **Static for** is a for-loop with static bounds (known at compile time) and indices usable in constant expressions.
 
 ## Parallel For
 
@@ -102,32 +102,29 @@ Algorithms::UnrolledFor< 0, Size, true >::exec( addition, 3.14 );
 
 `UnrolledFor` can be used also in CUDA kernels.
 
-## Templated Static For
+## Static For
 
-Templated static for-loop (`TemplateStaticFor`) is a for-loop in template parameters. For example, if class `LoopBody` is defined as
+\ref TNL::Algorithms::staticFor is a generic for-loop whose iteration indices are usable in constant expressions (e.g. template arguments). It can be used as
 
-```
-template< int i >
-struct LoopBody
-{
-   static void exec() { ... };
-}
+```cpp
+staticFor< int, 0, N >( f );
 ```
 
-one might need to execute the following sequence of statements:
+which results in the following sequence of function calls:
 
-```
-LoopBody< 0 >::exec();
-LoopBody< 1 >::exec();
-LoopBody< 3 >::exec();
+```cpp
+f( std::integral_constant< int, 0 >{} );
+f( std::integral_constant< int, 1 >{} );
+f( std::integral_constant< int, 2 >{} );
+f( std::integral_constant< int, 3 >{} );
 ...
-LoodBody< N >::exec();
+f( std::integral_constant< int, N - 1 >{} );
 ```
 
-This is exactly what `TemplateStaticFor` can do - in a slightly more general way. See the following example:
+Notice that each iteration index is represented by its own distinct type using \ref std::integral_constant. Hence, the functor `f` must be generic, e.g. a _generic lambda expression_ such as in the following example:
 
-\include TemplateStaticForExample.cpp
+\include staticForExample.cpp
 
 The output looks as follows:
 
-\include TemplateStaticForExample.out
+\include staticForExample.out
diff --git a/src/TNL/Algorithms/TemplateStaticFor.h b/src/TNL/Algorithms/TemplateStaticFor.h
deleted file mode 100644
index c96c816dc..000000000
--- a/src/TNL/Algorithms/TemplateStaticFor.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************
-                          TemplateStaticFor.h  -  description
-                             -------------------
-    begin                : Feb 23, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <utility>
-#include <type_traits>
-
-#include <TNL/Cuda/CudaCallable.h>
-
-namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief TemplateStaticFor serves for coding for-loops in template parameters.
- *
- * The result of calling this loop with a templated class \p LoopBody is as follows:
- *
- * LoopBody< begin >::exec( ... );
- *
- * LoodBody< begin + 1 >::exec( ... );
- *
- * ...
- *
- * LoopBody< end - 1 >::exec( ... );
- *
- * \tparam IndexType is type of the loop indexes
- * \tparam begin the loop iterates over index interval [begin,end).
- * \tparam end the loop iterates over index interval [begin,end).
- * \tparam LoopBody is a templated class having one template parameter of IndexType.
- *
- * \par Example
- * \include Algorithms/TamplateStaticForExample.cpp
- * \par Output
- * \include TamplateStaticForExample.out
- */
-template< typename IndexType,
-          IndexType begin,
-          IndexType end,
-          template< IndexType > class LoopBody >
-struct TemplateStaticFor;
-
-namespace detail {
-
-template< typename IndexType,
-          typename Begin,
-          typename N,
-          template< IndexType > class LoopBody >
-struct TemplateStaticForExecutor
-{
-   /**
-    * \brief Static method initiating the for-loop.
-    *
-    * \tparam Args type of user defined data to be passed to for-loop.
-    * \param args user defined data to be passed to for-loop.
-    */
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {
-      using Decrement = std::integral_constant< IndexType, N::value - 1 >;
-      TemplateStaticForExecutor< IndexType, Begin, Decrement, LoopBody >::exec( std::forward< Args >( args )... );
-      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
-   }
-
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {
-      using Decrement = std::integral_constant< IndexType, N::value - 1 >;
-      TemplateStaticForExecutor< IndexType, Begin, Decrement, LoopBody >::execHost( std::forward< Args >( args )... );
-      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
-   }
-};
-
-template< typename IndexType,
-          typename Begin,
-          template< IndexType > class LoopBody >
-struct TemplateStaticForExecutor< IndexType,
-                                  Begin,
-                                  std::integral_constant< IndexType, 0 >,
-                                  LoopBody >
-{
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {}
-
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {}
-};
-
-} // namespace detail
-
-template< typename IndexType,
-          IndexType begin,
-          IndexType end,
-          template< IndexType > class LoopBody >
-struct TemplateStaticFor
-{
-   template< typename... Args >
-   __cuda_callable__
-   static void exec( Args&&... args )
-   {
-      detail::TemplateStaticForExecutor< IndexType,
-                                 std::integral_constant< IndexType, begin >,
-                                 std::integral_constant< IndexType, end - begin >,
-                                 LoopBody >::exec( std::forward< Args >( args )... );
-   }
-
-   // nvcc would complain if we wonted to call a host-only function from the __cuda_callable__ exec above
-   template< typename... Args >
-   static void execHost( Args&&... args )
-   {
-      detail::TemplateStaticForExecutor< IndexType,
-                                 std::integral_constant< IndexType, begin >,
-                                 std::integral_constant< IndexType, end - begin >,
-                                 LoopBody >::execHost( std::forward< Args >( args )... );
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/staticFor.h b/src/TNL/Algorithms/staticFor.h
new file mode 100644
index 000000000..ebbfe5ae3
--- /dev/null
+++ b/src/TNL/Algorithms/staticFor.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+                          staticFor.h  -  description
+                             -------------------
+    begin                : Feb 23, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <utility>
+#include <type_traits>
+
+namespace TNL {
+namespace Algorithms {
+
+namespace detail {
+#if __cplusplus >= 201703L
+
+// C++17 version using fold expression
+template< typename Index, Index begin,  typename Func, Index... idx >
+constexpr void static_for_impl( Func &&f, std::integer_sequence< Index, idx... > )
+{
+   ( f( std::integral_constant<Index, begin + idx>{} ), ... );
+}
+
+#else
+
+// C++14 version using recursion and variadic pack
+template< typename Index, Index begin,  typename Func, Index idx >
+constexpr void static_for_impl( Func &&f, std::integer_sequence< Index, idx > )
+{
+   f( std::integral_constant<Index, begin + idx>{} );
+}
+
+template< typename Index, Index begin,  typename Func, Index idx, Index... indices >
+// WTF why, clang, why...
+//constexpr void
+constexpr std::enable_if_t< sizeof...(indices) >= 1 >
+static_for_impl( Func &&f, std::integer_sequence< Index, idx, indices... > )
+{
+   static_for_impl< Index, begin >(
+         std::forward< Func >( f ),
+         std::integer_sequence< Index, idx >{}
+   );
+   static_for_impl< Index, begin >(
+         std::forward< Func >( f ),
+         std::integer_sequence< Index, indices... >{}
+   );
+}
+
+#endif
+
+// overload selected via SFINAE for `begin < end` (i.e. non-empty loop)
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin < end) >
+static_for_dispatch( Func &&f )
+{
+   static_for_impl< Index, begin >(
+         std::forward< Func >( f ),
+         std::make_integer_sequence< Index, end - begin >{}
+   );
+}
+
+// overload selected via SFINAE for `begin >= end` (i.e. empty loop)
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+static_for_dispatch( Func &&f )
+{}
+
+} // namespace detail
+
+/**
+ * \brief Generic loop with constant bounds and indices usable in constant
+ * expressions.
+ *
+ * \e staticFor is a generic C++14/C++17 implementation of a static for-loop
+ * using \e constexpr functions and template metaprogramming. It is equivalent
+ * to executing a function `f(i)` for arguments `i` from the integral range
+ * `[begin, end)`, but with the type \ref std::integral_constant rather than
+ * `int` or `std::size_t` representing the indices. Hence, each index has its
+ * own distinct C++ type and the \e value of the index can be deduced from the
+ * type.
+ *
+ * Also note that thanks to `constexpr`, the argument `i` can be used in
+ * constant expressions and the \e staticFor function can be used from the host
+ * code as well as CUDA kernels (TNL requires the `--expt-relaxed-constexpr`
+ * parameter when compiled by `nvcc`).
+ *
+ * \tparam Index is the type of the loop indices.
+ * \tparam begin is the left bound of the iteration range `[begin, end)`.
+ * \tparam end is the right bound of the iteration range `[begin, end)`.
+ * \tparam Func is the type of the functor (it is usually deduced from the
+ *    argument used in the function call).
+ *
+ * \param f is the functor to be called in each iteration.
+ *
+ * \par Example
+ * \include Algorithms/staticForExample.cpp
+ * \par Output
+ * \include staticForExample.out
+ */
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr void staticFor( Func&& f )
+{
+   detail::static_for_dispatch< Index, begin, end >( std::forward< Func >( f ) );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 30ea96b4d..dd269c8bc 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -4,6 +4,7 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
+         staticForTest
 )
 
 set( CPP_TESTS )
diff --git a/src/UnitTests/Algorithms/staticForTest.cpp b/src/UnitTests/Algorithms/staticForTest.cpp
new file mode 100644
index 000000000..c40d3d9b9
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.cpp
@@ -0,0 +1 @@
+#include "staticForTest.h"
diff --git a/src/UnitTests/Algorithms/staticForTest.cu b/src/UnitTests/Algorithms/staticForTest.cu
new file mode 100644
index 000000000..c40d3d9b9
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.cu
@@ -0,0 +1 @@
+#include "staticForTest.h"
diff --git a/src/UnitTests/Algorithms/staticForTest.h b/src/UnitTests/Algorithms/staticForTest.h
new file mode 100644
index 000000000..4a44f65e2
--- /dev/null
+++ b/src/UnitTests/Algorithms/staticForTest.h
@@ -0,0 +1,158 @@
+/***************************************************************************
+                          staticForTest.h  -  description
+                             -------------------
+    begin                : Apr 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <array>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/staticFor.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+
+#ifdef HAVE_GTEST
+TEST( staticForTest, host_dynamic )
+{
+   constexpr int N = 5;
+   std::array< int, N > a;
+   a.fill( 0 );
+
+   staticFor< int, 0, N >(
+      [&a] ( auto i ) {
+         a[ i ] += 1;
+      }
+   );
+
+   std::array< int, N > expected;
+   expected.fill( 1 );
+   EXPECT_EQ( a, expected );
+}
+
+TEST( staticForTest, host_static )
+{
+   constexpr int N = 5;
+   std::array< int, N > a;
+   a.fill( 0 );
+
+   staticFor< int, 0, N >(
+      [&a] ( auto i ) {
+         std::get< i >( a ) += 1;
+      }
+   );
+
+   std::array< int, N > expected;
+   expected.fill( 1 );
+   EXPECT_EQ( a, expected );
+}
+
+TEST( staticForTest, host_empty )
+{
+   bool called = false;
+
+   staticFor< int, 0, 0 >(
+      [&called] ( auto i ) {
+         called = true;
+      }
+   );
+   EXPECT_FALSE( called );
+
+   staticFor< int, 0, -1 >(
+      [&called] ( auto i ) {
+         called = true;
+      }
+   );
+   EXPECT_FALSE( called );
+}
+
+#ifdef HAVE_CUDA
+// nvcc does not allow __cuda_callable__ lambdas inside private regions
+void test_cuda_dynamic()
+{
+   using Array = Containers::Array< int, Devices::Cuda >;
+   using ArrayHost = Containers::Array< int, Devices::Host >;
+   constexpr int N = 5;
+   Array a( N );
+   a.setValue( 0 );
+   auto view = a.getView();
+
+   auto kernel = [=] __cuda_callable__ (int j) mutable
+   {
+      staticFor< int, 0, N >(
+         [&view] ( auto i ) {
+            view[ i ] += 1;
+         }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+
+   ArrayHost expected;
+   expected.setSize( N );
+   expected.setValue( 1 );
+
+   ArrayHost ah;
+   ah = a;
+   EXPECT_EQ( ah, expected );
+}
+
+TEST( staticForTest, cuda_dynamic )
+{
+   test_cuda_dynamic();
+}
+
+template< int i, typename View >
+__cuda_callable__
+void static_helper( View& view )
+{
+   view[ i ] += 1;
+}
+
+// nvcc does not allow __cuda_callable__ lambdas inside private regions
+void test_cuda_static()
+{
+   using Array = Containers::Array< int, Devices::Cuda >;
+   using ArrayHost = Containers::Array< int, Devices::Host >;
+   constexpr int N = 5;
+   Array a( N );
+   a.setValue( 0 );
+   auto view = a.getView();
+
+   auto kernel = [=] __cuda_callable__ (int j) mutable
+   {
+      staticFor< int, 0, N >(
+         [&view] ( auto i ) {
+            static_helper< i >( view );
+         }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+
+   ArrayHost expected;
+   expected.setSize( N );
+   expected.setValue( 1 );
+
+   ArrayHost ah;
+   ah = a;
+   EXPECT_EQ( ah, expected );
+}
+
+TEST( staticForTest, cuda_static )
+{
+   test_cuda_static();
+}
+#endif
+#endif
+
+#include "../main.h"
-- 
GitLab


From 69deca314f38bc1b18ee225be7c60590c1cadc72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 8 Apr 2021 13:23:17 +0200
Subject: [PATCH 06/13] Updated code using TemplateStaticFor to use staticFor

---
 src/TNL/Containers/DistributedNDArray.h       |  39 +--
 .../DistributedNDArraySynchronizer.h          | 288 +++++++++---------
 src/TNL/Containers/ndarray/SizesHolder.h      |  82 ++---
 .../Containers/ndarray/SizesHolderHelpers.h   |  20 +-
 .../NeighborGridEntityGetter1D_impl.h         |  98 +++---
 .../NeighborGridEntityGetter2D_impl.h         | 108 +++----
 .../NeighborGridEntityGetter3D_impl.h         | 158 ++++------
 .../MeshDetails/IndexPermutationApplier.h     |  57 ++--
 .../initializer/SubentitySeedsCreator.h       |  39 +--
 .../layers/EntityTags/Initializer.h           |  48 +--
 10 files changed, 424 insertions(+), 513 deletions(-)

diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h
index 48de78e78..22e67a36e 100644
--- a/src/TNL/Containers/DistributedNDArray.h
+++ b/src/TNL/Containers/DistributedNDArray.h
@@ -391,7 +391,22 @@ public:
    void allocate()
    {
       SizesHolderType localSizes;
-      Algorithms::TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds );
+      Algorithms::staticFor< std::size_t, 0, SizesHolderType::getDimension() >(
+         [&] ( auto level ) {
+            if( SizesHolderType::template getStaticSize< level >() != 0 )
+               return;
+
+            const auto begin = localBegins.template getSize< level >();
+            const auto end = localEnds.template getSize< level >();
+            if( begin == end )
+               localSizes.template setSize< level >( globalSizes.template getSize< level >() );
+            else {
+               TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get<level>( OverlapsType{} ), "local size is less than the size of overlaps" );
+               //localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get<level>( OverlapsType{} ) );
+               localSizes.template setSize< level >( end - begin );
+            }
+         }
+      );
       localArray.setSize( localSizes );
    }
 
@@ -439,28 +454,6 @@ protected:
    // static sizes should have different type: localBegin is always 0, localEnd is always the full size
    LocalBeginsType localBegins;
    SizesHolderType localEnds;
-
-private:
-   template< std::size_t level >
-   struct LocalSizesSetter
-   {
-      template< typename SizesHolder, typename LocalBegins >
-      static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds )
-      {
-         if( SizesHolder::template getStaticSize< level >() != 0 )
-            return;
-
-         const auto begin = localBegins.template getSize< level >();
-         const auto end = localEnds.template getSize< level >();
-         if( begin == end )
-            localSizes.template setSize< level >( globalSizes.template getSize< level >() );
-         else {
-            TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get<level>( OverlapsType{} ), "local size is less than the size of overlaps" );
-            //localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get<level>( OverlapsType{} ) );
-            localSizes.template setSize< level >( end - begin );
-         }
-      }
-   };
 };
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h
index fcdb728cf..73cdda7a0 100644
--- a/src/TNL/Containers/DistributedNDArraySynchronizer.h
+++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h
@@ -156,7 +156,11 @@ public:
          this->mask = mask;
 
          // allocate buffers
-         Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view );
+         Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+            [&] ( auto dim ) {
+               allocateHelper< dim >( buffers, array_view );
+            }
+         );
       }
       else {
          // only bind to the actual data
@@ -239,12 +243,20 @@ protected:
    RequestsVector worker_init()
    {
       // fill send buffers
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            copyHelper< dim >( buffers, array_view, true, mask );
+         }
+      );
 
       // issue all send and receive async operations
       RequestsVector requests;
       const MPI_Comm group = array_view.getCommunicationGroup();
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group, tag_offset, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            sendHelper< dim >( buffers, requests, group, tag_offset, mask );
+         }
+      );
 
       return requests;
    }
@@ -252,170 +264,164 @@ protected:
    void worker_finish()
    {
       // copy data from receive buffers
-      Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false, mask );
+      Algorithms::staticFor< std::size_t, 0, DistributedNDArray::getDimension() >(
+         [&] ( auto dim ) {
+            copyHelper< dim >( buffers, array_view, false, mask );
+         }
+      );
    }
 
    template< std::size_t dim >
-   struct AllocateHelper
+   static void allocateHelper( Buffers& buffers, const DistributedNDArrayView& array_view )
    {
-      static void exec( Buffers& buffers, const DistributedNDArrayView& array_view )
-      {
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 ) {
-            dim_buffers.reset();
-            return;
-         }
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 ) {
+         dim_buffers.reset();
+         return;
+      }
 
-         using LocalBegins = typename DistributedNDArray::LocalBeginsType;
-         using SizesHolder = typename DistributedNDArray::SizesHolderType;
-         const LocalBegins& localBegins = array_view.getLocalBegins();
-         const SizesHolder& localEnds = array_view.getLocalEnds();
+      using LocalBegins = typename DistributedNDArray::LocalBeginsType;
+      using SizesHolder = typename DistributedNDArray::SizesHolderType;
+      const LocalBegins& localBegins = array_view.getLocalBegins();
+      const SizesHolder& localEnds = array_view.getLocalEnds();
 
-         SizesHolder bufferSize( localEnds );
-         bufferSize.template setSize< dim >( overlap );
+      SizesHolder bufferSize( localEnds );
+      bufferSize.template setSize< dim >( overlap );
 
-         // allocate buffers
-         dim_buffers.left_send_buffer.setSize( bufferSize );
-         dim_buffers.left_recv_buffer.setSize( bufferSize );
-         dim_buffers.right_send_buffer.setSize( bufferSize );
-         dim_buffers.right_recv_buffer.setSize( bufferSize );
-
-         // bind views to the buffers
-         dim_buffers.left_send_view.bind( dim_buffers.left_send_buffer.getView() );
-         dim_buffers.left_recv_view.bind( dim_buffers.left_recv_buffer.getView() );
-         dim_buffers.right_send_view.bind( dim_buffers.right_send_buffer.getView() );
-         dim_buffers.right_recv_view.bind( dim_buffers.right_recv_buffer.getView() );
-
-         // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?)
-
-         // offsets for left-send
-         dim_buffers.left_send_offsets = localBegins;
-
-         // offsets for left-receive
-         dim_buffers.left_recv_offsets = localBegins;
-         dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap );
-
-         // offsets for right-send
-         dim_buffers.right_send_offsets = localBegins;
-         dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap );
-
-         // offsets for right-receive
-         dim_buffers.right_recv_offsets = localBegins;
-         dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
-
-         // FIXME: set proper neighbor IDs !!!
-         const MPI_Comm group = array_view.getCommunicationGroup();
-         const int rank = MPI::GetRank(group);
-         const int nproc = MPI::GetSize(group);
-         dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
-         dim_buffers.right_neighbor = (rank + 1) % nproc;
-      }
-   };
+      // allocate buffers
+      dim_buffers.left_send_buffer.setSize( bufferSize );
+      dim_buffers.left_recv_buffer.setSize( bufferSize );
+      dim_buffers.right_send_buffer.setSize( bufferSize );
+      dim_buffers.right_recv_buffer.setSize( bufferSize );
+
+      // bind views to the buffers
+      dim_buffers.left_send_view.bind( dim_buffers.left_send_buffer.getView() );
+      dim_buffers.left_recv_view.bind( dim_buffers.left_recv_buffer.getView() );
+      dim_buffers.right_send_view.bind( dim_buffers.right_send_buffer.getView() );
+      dim_buffers.right_recv_view.bind( dim_buffers.right_recv_buffer.getView() );
+
+      // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?)
+
+      // offsets for left-send
+      dim_buffers.left_send_offsets = localBegins;
+
+      // offsets for left-receive
+      dim_buffers.left_recv_offsets = localBegins;
+      dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap );
+
+      // offsets for right-send
+      dim_buffers.right_send_offsets = localBegins;
+      dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap );
+
+      // offsets for right-receive
+      dim_buffers.right_recv_offsets = localBegins;
+      dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() );
+
+      // FIXME: set proper neighbor IDs !!!
+      const MPI_Comm group = array_view.getCommunicationGroup();
+      const int rank = MPI::GetRank(group);
+      const int nproc = MPI::GetSize(group);
+      dim_buffers.left_neighbor = (rank + nproc - 1) % nproc;
+      dim_buffers.right_neighbor = (rank + 1) % nproc;
+   }
 
    template< std::size_t dim >
-   struct CopyHelper
+   static void copyHelper( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer, SyncDirection mask )
    {
-      static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer, SyncDirection mask )
-      {
-         // skip if there are no overlaps
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 )
-            return;
-
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
-
-         if( buffered ) {
-            // TODO: specify CUDA stream for the copy, otherwise async won't work !!!
-            CopyKernel< decltype(dim_buffers.left_send_view) > copy_kernel;
-            copy_kernel.array_view.bind( array_view );
-            copy_kernel.to_buffer = to_buffer;
-
-            if( to_buffer ) {
-               if( mask & SyncDirection::Left ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.left_send_view );
-                  copy_kernel.array_offsets = dim_buffers.left_send_offsets;
-                  dim_buffers.left_send_view.forAll( copy_kernel );
-               }
-
-               if( mask & SyncDirection::Right ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.right_send_view );
-                  copy_kernel.array_offsets = dim_buffers.right_send_offsets;
-                  dim_buffers.right_send_view.forAll( copy_kernel );
-               }
+      // skip if there are no overlaps
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 )
+         return;
+
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
+
+      if( buffered ) {
+         // TODO: specify CUDA stream for the copy, otherwise async won't work !!!
+         CopyKernel< decltype(dim_buffers.left_send_view) > copy_kernel;
+         copy_kernel.array_view.bind( array_view );
+         copy_kernel.to_buffer = to_buffer;
+
+         if( to_buffer ) {
+            if( mask & SyncDirection::Left ) {
+               copy_kernel.buffer_view.bind( dim_buffers.left_send_view );
+               copy_kernel.array_offsets = dim_buffers.left_send_offsets;
+               dim_buffers.left_send_view.forAll( copy_kernel );
             }
-            else {
-               if( mask & SyncDirection::Right ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.left_recv_view );
-                  copy_kernel.array_offsets = dim_buffers.left_recv_offsets;
-                  dim_buffers.left_recv_view.forAll( copy_kernel );
-               }
-
-               if( mask & SyncDirection::Left ) {
-                  copy_kernel.buffer_view.bind( dim_buffers.right_recv_view );
-                  copy_kernel.array_offsets = dim_buffers.right_recv_offsets;
-                  dim_buffers.right_recv_view.forAll( copy_kernel );
-               }
+
+            if( mask & SyncDirection::Right ) {
+               copy_kernel.buffer_view.bind( dim_buffers.right_send_view );
+               copy_kernel.array_offsets = dim_buffers.right_send_offsets;
+               dim_buffers.right_send_view.forAll( copy_kernel );
             }
          }
          else {
-            // avoid buffering - bind buffer views directly to the array
-            dim_buffers.left_send_view.bind( &call_with_offsets( dim_buffers.left_send_offsets, array_view ) );
-            dim_buffers.left_recv_view.bind( &call_with_offsets( dim_buffers.left_recv_offsets, array_view ) );
-            dim_buffers.right_send_view.bind( &call_with_offsets( dim_buffers.right_send_offsets, array_view ) );
-            dim_buffers.right_recv_view.bind( &call_with_offsets( dim_buffers.right_recv_offsets, array_view ) );
-         }
+            if( mask & SyncDirection::Right ) {
+               copy_kernel.buffer_view.bind( dim_buffers.left_recv_view );
+               copy_kernel.array_offsets = dim_buffers.left_recv_offsets;
+               dim_buffers.left_recv_view.forAll( copy_kernel );
+            }
 
+            if( mask & SyncDirection::Left ) {
+               copy_kernel.buffer_view.bind( dim_buffers.right_recv_view );
+               copy_kernel.array_offsets = dim_buffers.right_recv_offsets;
+               dim_buffers.right_recv_view.forAll( copy_kernel );
+            }
+         }
       }
-   };
+      else {
+         // avoid buffering - bind buffer views directly to the array
+         dim_buffers.left_send_view.bind( &call_with_offsets( dim_buffers.left_send_offsets, array_view ) );
+         dim_buffers.left_recv_view.bind( &call_with_offsets( dim_buffers.left_recv_offsets, array_view ) );
+         dim_buffers.right_send_view.bind( &call_with_offsets( dim_buffers.right_send_offsets, array_view ) );
+         dim_buffers.right_recv_view.bind( &call_with_offsets( dim_buffers.right_recv_offsets, array_view ) );
+      }
+
+   }
 
    template< std::size_t dim >
-   struct SendHelper
+   static void sendHelper( Buffers& buffers, RequestsVector& requests, MPI_Comm group, int tag_offset, SyncDirection mask )
    {
-      template< typename Requests, typename Group >
-      static void exec( Buffers& buffers, Requests& requests, Group group, int tag_offset, SyncDirection mask )
-      {
-         constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
-         if( overlap == 0 )
-            return;
+      constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >();
+      if( overlap == 0 )
+         return;
 
-         auto& dim_buffers = buffers.template getDimBuffers< dim >();
+      auto& dim_buffers = buffers.template getDimBuffers< dim >();
 
-         if( LBM_HACK == false ) {
-            if( mask & SyncDirection::Left ) {
-               requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
-                                               dim_buffers.left_send_view.getStorageSize(),
-                                               dim_buffers.left_neighbor, tag_offset + 0, group ) );
-               requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
-                                               dim_buffers.right_recv_view.getStorageSize(),
-                                               dim_buffers.right_neighbor, tag_offset + 0, group ) );
-            }
-            if( mask & SyncDirection::Right ) {
-               requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
-                                               dim_buffers.right_send_view.getStorageSize(),
-                                               dim_buffers.right_neighbor, tag_offset + 1, group ) );
-               requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
-                                               dim_buffers.left_recv_view.getStorageSize(),
-                                               dim_buffers.left_neighbor, tag_offset + 1, group ) );
-            }
-         }
-         else {
-            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
-                                            dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+      if( LBM_HACK == false ) {
+         if( mask & SyncDirection::Left ) {
+            requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(),
+                                            dim_buffers.left_send_view.getStorageSize(),
                                             dim_buffers.left_neighbor, tag_offset + 0, group ) );
-            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                            dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
-                                            dim_buffers.left_neighbor, tag_offset + 1, group ) );
-            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
-                                            dim_buffers.right_send_view.getStorageSize() / 27 * 9,
-                                            dim_buffers.right_neighbor, tag_offset + 1, group ) );
-            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
-                                            dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+            requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(),
+                                            dim_buffers.right_recv_view.getStorageSize(),
                                             dim_buffers.right_neighbor, tag_offset + 0, group ) );
          }
+         if( mask & SyncDirection::Right ) {
+            requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(),
+                                            dim_buffers.right_send_view.getStorageSize(),
+                                            dim_buffers.right_neighbor, tag_offset + 1, group ) );
+            requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(),
+                                            dim_buffers.left_recv_view.getStorageSize(),
+                                            dim_buffers.left_neighbor, tag_offset + 1, group ) );
+         }
       }
-   };
+      else {
+         requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0,
+                                         dim_buffers.left_send_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.left_neighbor, tag_offset + 0, group ) );
+         requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                         dim_buffers.left_recv_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.left_neighbor, tag_offset + 1, group ) );
+         requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18,
+                                         dim_buffers.right_send_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.right_neighbor, tag_offset + 1, group ) );
+         requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0,
+                                         dim_buffers.right_recv_view.getStorageSize() / 27 * 9,
+                                         dim_buffers.right_neighbor, tag_offset + 0, group ) );
+      }
+   }
 
 #ifdef __NVCC__
 public:
diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h
index 1375683b2..97fd8122c 100644
--- a/src/TNL/Containers/ndarray/SizesHolder.h
+++ b/src/TNL/Containers/ndarray/SizesHolder.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Cuda/CudaCallable.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 #include <TNL/Containers/ndarray/Meta.h>
 
@@ -124,48 +124,6 @@ protected:
     }
 };
 
-template< std::size_t dimension >
-struct SizesHolderStaticSizePrinter
-{
-   template< typename SizesHolder >
-   static void exec( std::ostream& str, const SizesHolder& holder )
-   {
-      str << holder.template getStaticSize< dimension >() << ", ";
-   }
-};
-
-template< std::size_t dimension >
-struct SizesHolderSizePrinter
-{
-   template< typename SizesHolder >
-   static void exec( std::ostream& str, const SizesHolder& holder )
-   {
-      str << holder.template getSize< dimension >() << ", ";
-   }
-};
-
-template< std::size_t level >
-struct SizesHolerOperatorPlusHelper
-{
-   template< typename Result, typename LHS, typename RHS >
-   static void exec( Result& result, const LHS& lhs, const RHS& rhs )
-   {
-      if( result.template getStaticSize< level >() == 0 )
-         result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() );
-   }
-};
-
-template< std::size_t level >
-struct SizesHolerOperatorMinusHelper
-{
-   template< typename Result, typename LHS, typename RHS >
-   static void exec( Result& result, const LHS& lhs, const RHS& rhs )
-   {
-      if( result.template getStaticSize< level >() == 0 )
-         result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() );
-   }
-};
-
 } // namespace __ndarray_impl
 
 
@@ -231,7 +189,12 @@ SizesHolder< Index, sizes... >
 operator+( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs )
 {
    SizesHolder< Index, sizes... > result;
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorPlusHelper >::execHost( result, lhs, rhs );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) >(
+      [&result, &lhs, &rhs] ( auto level ) {  // `level` is usable as a template argument below
+         if( result.template getStaticSize< level >() == 0 )  // a size is set only where the static size of the result is 0
+            result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() );
+      }
+   );
    return result;
 }
 
@@ -242,7 +205,12 @@ SizesHolder< Index, sizes... >
 operator-( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs )
 {
    SizesHolder< Index, sizes... > result;
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorMinusHelper >::execHost( result, lhs, rhs );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) >(
+      [&result, &lhs, &rhs] ( auto level ) {  // `level` is usable as a template argument below
+         if( result.template getStaticSize< level >() == 0 )  // a size is set only where the static size of the result is 0
+            result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() );
+      }
+   );
    return result;
 }
 
@@ -295,9 +263,17 @@ template< typename Index,
 std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder )
 {
    str << "SizesHolder< ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, holder );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(  // the upper bound leaves out the last dimension, which is printed separately without ", "
+      [&str, &holder] ( auto dimension ) {
+         str << holder.template getStaticSize< dimension >() << ", ";  // static size followed by a separator
+      }
+   );
    str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(  // the upper bound leaves out the last dimension, which is printed separately without ", "
+      [&str, &holder] ( auto dimension ) {
+         str << holder.template getSize< dimension >() << ", ";  // value of getSize followed by a separator
+      }
+   );
    str << holder.template getSize< sizeof...(sizes) - 1 >() << " )";
    return str;
 }
@@ -360,10 +336,18 @@ template< typename Index,
 std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... >, ConstValue >& holder )
 {
    str << "LocalBeginsHolder< SizesHolder< ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... >) holder );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(  // the upper bound leaves out the last dimension, which is printed separately
+      [&str, &holder] ( auto dimension ) {  // query the holder converted to SizesHolder, as the replaced TemplateStaticFor call did
+         str << static_cast< SizesHolder< Index, sizes... > >( holder ).template getStaticSize< dimension >() << ", ";
+      }
+   );
    str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >, ";
    str << ConstValue << " >( ";
-   Algorithms::TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder );
+   Algorithms::staticFor< std::size_t, 0, sizeof...(sizes) - 1 >(  // the upper bound leaves out the last dimension, which is printed separately without ", "
+      [&str, &holder] ( auto dimension ) {
+         str << holder.template getSize< dimension >() << ", ";  // getSize is called on the LocalBeginsHolder itself
+      }
+   );
    str << holder.template getSize< sizeof...(sizes) - 1 >() << " )";
    return str;
 }
diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
index 4e5473c70..fc835d13f 100644
--- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h
+++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
@@ -15,7 +15,7 @@
 #include <algorithm>
 
 #include <TNL/Assert.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Containers/ndarray/Meta.h>
 
 namespace TNL {
@@ -209,18 +209,6 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 >
 };
 
 
-template< std::size_t level >
-struct WeakCompareHelper
-{
-   template< typename SizesHolder1,
-             typename SizesHolder2 >
-   __cuda_callable__
-   static void exec( const SizesHolder1& sizes1, const SizesHolder2& sizes2, bool& result )
-   {
-      result &= sizes1.template getSize< level >() == sizes2.template getSize< level >();
-   }
-};
-
 // helper for the assignment operator in NDArrayView
 template< typename SizesHolder1,
           typename SizesHolder2 >
@@ -230,7 +218,11 @@ bool sizesWeakCompare( const SizesHolder1& sizes1, const SizesHolder2& sizes2 )
    static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(),
                   "Cannot compare sizes of different dimensions." );
    bool result = true;
-   Algorithms::TemplateStaticFor< std::size_t, 0, SizesHolder1::getDimension(), WeakCompareHelper >::exec( sizes1, sizes2, result );
+   Algorithms::staticFor< std::size_t, 0, SizesHolder1::getDimension() >(
+      [&result, &sizes1, &sizes2] ( auto level ) {  // `level` is usable as a template argument below
+         result = result && sizes1.template getSize< level >() == sizes2.template getSize< level >();  // && skips the comparison once result is false
+      }
+   );
    return result;
 }
 
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
index 840a201c6..ba619bc92 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter1D_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +36,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -46,12 +46,12 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -65,7 +65,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -79,10 +79,10 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
@@ -107,7 +107,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -117,14 +117,14 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       static constexpr int stencilSize = Config::getStencilSize();
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -138,7 +138,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntityType( this->entity.getMesh(), CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -157,33 +157,25 @@ class NeighborGridEntityGetter<
 #else
          return this->entity.getIndex() + step;
 #endif
- 
+
       }
- 
-      template< IndexType index >
-      class StencilRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencil[ index + stencilSize ] = entityIndex + index;
-            }
-      };
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA  // TODO: fix it -- does not work with nvcc
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilRefresher >::exec( *this, entityIndex );
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >(  // presumably a half-open range, i.e. offsets -stencilSize .. stencilSize
+            [&] ( auto index ) {  // [&] also gives access to the `stencil` member through `this`
+               stencil[ index + stencilSize ] = entityIndex + index;  // shifting by stencilSize keeps the subscript non-negative
+            }
+         );
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencil[ 2 * stencilSize + 1 ];
 };
 
@@ -204,7 +196,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -214,12 +206,12 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -233,7 +225,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step + ( step < 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -247,16 +239,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step + ( step < 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 /****
@@ -277,7 +269,7 @@ class NeighborGridEntityGetter<
    StencilStorage > //GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -287,14 +279,14 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       void test() const { std::cerr << "***" << std::endl; };
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -308,7 +300,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step - ( step > 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -322,7 +314,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step - ( step > 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -348,7 +340,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -358,13 +350,13 @@ class NeighborGridEntityGetter<
       typedef Index IndexType;
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
 
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -378,7 +370,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step - ( step > 0 ) ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -392,7 +384,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + step - ( step > 0 );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -419,7 +411,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 1, Real, Device, Index > GridType;
@@ -434,7 +426,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int step >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -448,7 +440,7 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return NeighborGridEntity( CoordinatesType( entity.getCoordinates().x() + step ) );
       }
- 
+
       template< int step >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -463,11 +455,11 @@ class NeighborGridEntityGetter<
 
          return this->entity.getIndex() + step;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
- 
+
    protected:
 
       const GridEntityType& entity;
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
index 1591a523e..00286663a 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter2D_impl.h
@@ -14,6 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +37,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -51,7 +52,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -67,7 +68,7 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX,
                                                           entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -81,14 +82,14 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepY * entity.getMesh().getDimensions().x() + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -109,7 +110,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -120,14 +121,14 @@ class NeighborGridEntityGetter<
       typedef typename GridType::CoordinatesType CoordinatesType;
       typedef GridEntityGetter< GridType, NeighborGridEntityType > GridEntityGetterType;
       typedef GridEntityStencilStorageTag< GridEntityCrossStencil > StencilStorage;
- 
+
       static constexpr int stencilSize = Config::getStencilSize();
 
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -143,7 +144,7 @@ class NeighborGridEntityGetter<
                                             CoordinatesType( entity.getCoordinates().x() + stepX,
                                                              entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -166,52 +167,33 @@ class NeighborGridEntityGetter<
 #else
          return this->entity.getIndex() + stepY * entity.getMesh().getDimensions().x() + stepX;
 #endif
- 
-      }
- 
-      template< IndexType index >
-      class StencilXRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilX[ index + stencilSize ] = entityIndex + index;
-            }
-      };
 
-      template< IndexType index >
-      class StencilYRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilY[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().getDimensions().x();
-            }
-      };
+      }
 
- 
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA // TODO: fix this to work with CUDA
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilXRefresher >::exec( *this, entityIndex );
+         auto stencilXRefresher = [&] ( auto index ) {  // stores entityIndex shifted by `index` at subscript index + stencilSize
+            stencilX[ index + stencilSize ] = entityIndex + index;
+         };
+         auto stencilYRefresher = [&] ( auto index ) {  // like the X variant, but `index` is scaled by the x-dimension of the mesh
+            stencilY[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().getDimensions().x();
+         };
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilYRefresher );  // Y offsets starting at -stencilSize
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilYRefresher );  // Y offsets starting at 1; offset 0 is in neither Y range, so stencilY[ stencilSize ] is not written here
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >( stencilXRefresher );  // X offsets in a single range that includes 0
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencilX[ 2 * stencilSize + 1 ];
       IndexType stencilY[ 2 * stencilSize + 1 ];
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -233,7 +215,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -250,7 +232,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -276,14 +258,14 @@ class NeighborGridEntityGetter<
                                                                 stepY ? (stepY > 0 ? 1 : -1) : 0 ),
                                          EntityBasisType( ! stepX, ! stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
@@ -310,7 +292,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -325,7 +307,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -348,21 +330,21 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX + ( stepX < 0 ),
                                                           entity.getCoordinates().y() + stepY + ( stepY < 0 ) ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -384,7 +366,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 1;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -399,7 +381,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -424,17 +406,17 @@ class NeighborGridEntityGetter<
                      CoordinatesType( entity.getCoordinates().x() + stepX - ( stepX > 0 ) * ( entity.getOrientation().x() != 0.0 ),
                                       entity.getCoordinates().y() + stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ) ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), this->template getEntity< stepX, stepY >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
@@ -458,7 +440,7 @@ class NeighborGridEntityGetter<
    StencilStorage >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 2, Real, Device, Index > GridType;
@@ -473,7 +455,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -489,7 +471,7 @@ class NeighborGridEntityGetter<
                                          CoordinatesType( entity.getCoordinates().x() + stepX,
                                                           entity.getCoordinates().y() + stepY ) );
       }
- 
+
       template< int stepX, int stepY >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -503,14 +485,14 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepY * ( entity.getMesh().getDimensions().x() + 1 ) + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
index 7c260f577..1a377490b 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter3D_impl.h
@@ -14,7 +14,7 @@
 #include <TNL/Meshes/GridDetails/Grid1D.h>
 #include <TNL/Meshes/GridDetails/Grid2D.h>
 #include <TNL/Meshes/GridDetails/Grid3D.h>
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 
 namespace TNL {
 namespace Meshes {
@@ -36,7 +36,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -51,7 +51,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -68,7 +68,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -83,16 +83,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + ( stepZ * entity.getMesh().getDimensions().y() + stepY ) * entity.getMesh().getDimensions().x() + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 
@@ -113,7 +113,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -126,12 +126,12 @@ class NeighborGridEntityGetter<
       typedef GridEntityStencilStorageTag< GridEntityCrossStencil > StencilStorage;
 
       static constexpr int stencilSize = Config::getStencilSize();
- 
+
       __cuda_callable__ inline
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -148,7 +148,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -179,66 +179,38 @@ class NeighborGridEntityGetter<
 #endif
 
       }
- 
-      template< IndexType index >
-      class StencilXRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilX[ index + stencilSize ] = entityIndex + index;
-            }
-      };
-
-      template< IndexType index >
-      class StencilYRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilY[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().getDimensions().x();
-            }
-      };
- 
-      template< IndexType index >
-      class StencilZRefresher
-      {
-         public:
- 
-            __cuda_callable__
-            static void exec( NeighborGridEntityGetter& neighborEntityGetter, const IndexType& entityIndex )
-            {
-               neighborEntityGetter.stencilZ[ index + stencilSize ] =
-                  entityIndex + index * neighborEntityGetter.entity.getMesh().cellZNeighborsStep;
-            }
-      };
 
- 
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex )
       {
 #ifndef HAVE_CUDA // TODO: fix this to work with CUDA
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilZRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilZRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, 0, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, 1, stencilSize + 1, StencilYRefresher >::exec( *this, entityIndex );
-         Algorithms::TemplateStaticFor< IndexType, -stencilSize, stencilSize + 1, StencilXRefresher >::exec( *this, entityIndex );
+         auto stencilXRefresher = [&] ( auto index ) {
+            stencilX[ index + stencilSize ] = entityIndex + index;
+         };
+         auto stencilYRefresher = [&] ( auto index ) {
+            stencilY[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().getDimensions().x();
+         };
+         auto stencilZRefresher = [&] ( auto index ) {
+            stencilZ[ index + stencilSize ] =
+               entityIndex + index * entity.getMesh().cellZNeighborsStep;
+         };
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilZRefresher );
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilZRefresher );
+         Algorithms::staticFor< IndexType, -stencilSize, 0 >( stencilYRefresher );
+         Algorithms::staticFor< IndexType, 1, stencilSize + 1 >( stencilYRefresher );
+         Algorithms::staticFor< IndexType, -stencilSize, stencilSize + 1 >( stencilXRefresher );
 #endif
       };
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       IndexType stencilX[ 2 * stencilSize + 1 ];
       IndexType stencilY[ 2 * stencilSize + 1 ];
       IndexType stencilZ[ 2 * stencilSize + 1 ];
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -259,7 +231,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -276,7 +248,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -307,21 +279,21 @@ class NeighborGridEntityGetter<
                                                                stepZ ? (stepZ > 0 ? 1 : -1) : 0 ),
                                         EntityBasisType( ! stepX, !stepY, !stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -342,7 +314,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityCrossStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 2;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -359,7 +331,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -390,21 +362,21 @@ class NeighborGridEntityGetter<
                                                                stepZ ? (stepZ > 0 ? 1 : -1) : 0 ),
                                         EntityBasisType( ! stepX, !stepY, !stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -426,7 +398,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 1;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -443,7 +415,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -472,21 +444,21 @@ class NeighborGridEntityGetter<
                                         EntityOrientationType( !!stepX, !!stepY, !!stepZ ),
                                         EntityBasisType( !stepX, !stepY, !stepZ ));
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( this->entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
- 
+
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -508,7 +480,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 3;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -523,7 +495,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY,int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -554,21 +526,21 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY + ( stepY < 0 ),
                                                          entity.getCoordinates().z() + stepZ + ( stepZ < 0 ) ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -589,7 +561,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 2;
       static constexpr int NeighborEntityDimension = 3;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -604,7 +576,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -637,21 +609,21 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY - ( stepY > 0 ) * ( entity.getOrientation().y() != 0.0 ),
                                                          entity.getCoordinates().z() + stepZ - ( stepZ > 0 ) * ( entity.getOrientation().z() != 0.0 ) ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
       {
          return GridEntityGetterType::getEntityIndex( entity.getMesh(), getEntity< stepX, stepY, stepZ >() );
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
 };
 
@@ -672,7 +644,7 @@ class NeighborGridEntityGetter<
    GridEntityStencilStorageTag< GridEntityNoStencil > >
 {
    public:
- 
+
       static constexpr int EntityDimension = 0;
       static constexpr int NeighborEntityDimension = 0;
       typedef Meshes::Grid< 3, Real, Device, Index > GridType;
@@ -687,7 +659,7 @@ class NeighborGridEntityGetter<
       NeighborGridEntityGetter( const GridEntityType& entity )
       : entity( entity )
       {}
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       NeighborGridEntityType getEntity() const
@@ -705,7 +677,7 @@ class NeighborGridEntityGetter<
                                                          entity.getCoordinates().y() + stepY,
                                                          entity.getCoordinates().z() + stepZ ) );
       }
- 
+
       template< int stepX, int stepY, int stepZ >
       __cuda_callable__ inline
       IndexType getEntityIndex() const
@@ -720,16 +692,16 @@ class NeighborGridEntityGetter<
                    << " EntityDimension = " << EntityDimension );
          return this->entity.getIndex() + stepZ * ( entity.getMesh().getDimensions().y() + 1 + stepY ) * ( entity.getMesh().getDimensions().x() + 1 ) + stepX;
       }
- 
+
       __cuda_callable__
       void refresh( const GridType& grid, const IndexType& entityIndex ){};
 
    protected:
 
       const GridEntityType& entity;
- 
+
       //NeighborGridEntityGetter(){};
- 
+
 };
 
 } // namespace Meshes
diff --git a/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h b/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
index 71e9fab0e..9195a2e2a 100644
--- a/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
+++ b/src/TNL/Meshes/MeshDetails/IndexPermutationApplier.h
@@ -28,7 +28,7 @@ private:
                 Mesh::MeshTraitsType::template SubentityTraits< typename Mesh::template EntityType< Dimension >::EntityTopology,
                                                                 Subdimension >::storageEnabled
              >
-   struct _SubentitiesStorageWorker
+   struct SubentitiesStorageWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& perm )
       {
@@ -38,7 +38,7 @@ private:
    };
 
    template< int Subdimension >
-   struct _SubentitiesStorageWorker< Subdimension, false >
+   struct SubentitiesStorageWorker< Subdimension, false >
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -49,7 +49,7 @@ private:
                 Mesh::MeshTraitsType::template SuperentityTraits< typename Mesh::template EntityType< Dimension >::EntityTopology,
                                                                   Superdimension >::storageEnabled
              >
-   struct _SuperentitiesStorageWorker
+   struct SuperentitiesStorageWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& perm )
       {
@@ -60,7 +60,7 @@ private:
    };
 
    template< int Superdimension >
-   struct _SuperentitiesStorageWorker< Superdimension, false >
+   struct SuperentitiesStorageWorker< Superdimension, false >
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -71,7 +71,7 @@ private:
                 Mesh::MeshTraitsType::template SuperentityTraits< typename Mesh::template EntityType< Subdimension >::EntityTopology,
                                                                   Dimension >::storageEnabled
              >
-   struct IndexPermutationApplierSubentitiesWorker
+   struct SubentitiesWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm )
       {
@@ -81,7 +81,7 @@ private:
    };
 
    template< int Subdimension >
-   struct IndexPermutationApplierSubentitiesWorker< Subdimension, false >
+   struct SubentitiesWorker< Subdimension, false >
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
@@ -92,7 +92,7 @@ private:
                 Mesh::MeshTraitsType::template SubentityTraits< typename Mesh::template EntityType< Superdimension >::EntityTopology,
                                                                 Dimension >::storageEnabled
              >
-   struct IndexPermutationApplierSuperentitiesWorker
+   struct SuperentitiesWorker
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm )
       {
@@ -102,25 +102,12 @@ private:
    };
 
    template< int Superdimension >
-   struct IndexPermutationApplierSuperentitiesWorker< Superdimension, false >
+   struct SuperentitiesWorker< Superdimension, false >
    {
       static void exec( Mesh& mesh, const GlobalIndexArray& iperm ) {}
    };
 
 
-   // template aliases needed to hide the 'Enabled' parameter
-   template< int Subdimension >
-   using SubentitiesStorageWorker = _SubentitiesStorageWorker< Subdimension >;
-
-   template< int Superdimension >
-   using SuperentitiesStorageWorker = _SuperentitiesStorageWorker< Superdimension >;
-
-   template< int Subdimension >
-   using SubentitiesWorker = IndexPermutationApplierSubentitiesWorker< Subdimension >;
-
-   template< int Superdimension >
-   using SuperentitiesWorker = IndexPermutationApplierSuperentitiesWorker< Superdimension >;
-
    template< typename Mesh_, std::enable_if_t< Mesh_::Config::dualGraphStorage(), bool > = true >
    static void permuteDualGraph( Mesh_& mesh, const GlobalIndexArray& perm, const GlobalIndexArray& iperm )
    {
@@ -183,17 +170,33 @@ public:
       if( Dimension == 0 )
          permuteArray( mesh.getPoints(), perm );
 
-      // permute superentities storage
-      Algorithms::TemplateStaticFor< int, 0, Dimension, SubentitiesStorageWorker >::execHost( mesh, perm );
-
       // permute subentities storage
-      Algorithms::TemplateStaticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1, SuperentitiesStorageWorker >::execHost( mesh, perm );
+      Algorithms::staticFor< int, 0, Dimension >(
+         [&] ( auto dim ) {
+            SubentitiesStorageWorker< dim >::exec( mesh, perm );
+         }
+      );
+
+      // permute superentities storage
+      Algorithms::staticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1 >(
+         [&] ( auto dim ) {
+            SuperentitiesStorageWorker< dim >::exec( mesh, perm );
+         }
+      );
 
       // update superentity indices from the subentities
-      Algorithms::TemplateStaticFor< int, 0, Dimension, SubentitiesWorker >::execHost( mesh, iperm );
+      Algorithms::staticFor< int, 0, Dimension >(
+         [&] ( auto dim ) {
+            SubentitiesWorker< dim >::exec( mesh, iperm );
+         }
+      );
 
       // update subentity indices from the superentities
-      Algorithms::TemplateStaticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1, SuperentitiesWorker >::execHost( mesh, iperm );
+      Algorithms::staticFor< int, Dimension + 1, Mesh::getMeshDimension() + 1 >(
+         [&] ( auto dim ) {
+            SuperentitiesWorker< dim >::exec( mesh, iperm );
+         }
+      );
 
       if( Dimension == Mesh::getMeshDimension() ) {
          // permute dual graph
diff --git a/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h b/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
index 1e2edb332..52dcc73c7 100644
--- a/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
+++ b/src/TNL/Meshes/MeshDetails/initializer/SubentitySeedsCreator.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Meshes/MeshDetails/traits/MeshTraits.h>
 
 namespace TNL {
@@ -43,35 +43,20 @@ public:
    static SubentitySeedArray create( const SubvertexAccessorType& subvertices )
    {
       SubentitySeedArray subentitySeeds;
-      Algorithms::TemplateStaticFor< LocalIndexType, 0, SubentitySeedArray::getSize(), CreateSubentitySeeds >::execHost( subentitySeeds, subvertices );
+      Algorithms::staticFor< LocalIndexType, 0, SubentitySeedArray::getSize() >(
+         [&] ( auto subentityIndex ) {
+            Algorithms::staticFor< LocalIndexType, 0, SUBENTITY_VERTICES_COUNT >(
+               [&] ( auto subentityVertexIndex ) {
+                  // subentityIndex cannot be captured as constexpr, so we need to create another instance of its type
+                  static constexpr LocalIndexType VERTEX_INDEX = SubentityTraits::template Vertex< decltype(subentityIndex){}, subentityVertexIndex >::index;
+                  subentitySeeds[ subentityIndex ].setCornerId( subentityVertexIndex, subvertices.getColumnIndex( VERTEX_INDEX ) );
+               }
+            );
+         }
+      );
 
       return subentitySeeds;
    }
-
-private:
-   using SubentitySeed = EntitySeed< MeshConfig, SubentityTopology >;
-
-   template< LocalIndexType subentityIndex >
-   class CreateSubentitySeeds
-   {
-      public:
-         static void exec( SubentitySeedArray& subentitySeeds, const SubvertexAccessorType& subvertices )
-         {
-            Algorithms::TemplateStaticFor< LocalIndexType, 0, SUBENTITY_VERTICES_COUNT, SetSubentitySeedVertex >::execHost( subentitySeeds[ subentityIndex ], subvertices );
-         }
-
-      private:
-         template< LocalIndexType subentityVertexIndex >
-         class SetSubentitySeedVertex
-         {
-            public:
-               static void exec( SubentitySeed& subentitySeed, const SubvertexAccessorType& subvertices )
-               {
-                  static constexpr LocalIndexType VERTEX_INDEX = SubentityTraits::template Vertex< subentityIndex, subentityVertexIndex >::index;
-                  subentitySeed.setCornerId( subentityVertexIndex, subvertices.getColumnIndex( VERTEX_INDEX ) );
-               }
-         };
-   };
 };
 
 template< typename MeshConfig,
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
index 95e29182e..55d55890f 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Initializer.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/TemplateStaticFor.h>
+#include <TNL/Algorithms/staticFor.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Pointers/DevicePointer.h>
 #include <TNL/Meshes/DimensionTag.h>
@@ -47,15 +47,6 @@ protected:
       static constexpr bool value = MeshConfig::entityTagsStorage( EntityTopology() );
    };
 
-   template< int Dimension >
-   struct SetEntitiesCount
-   {
-      static void exec( Mesh& mesh )
-      {
-         mesh.template entityTagsSetEntitiesCount< Dimension >( mesh.template getEntitiesCount< Dimension >() );
-      }
-   };
-
    template< int Dimension >
    class ResetEntityTags
    {
@@ -123,15 +114,6 @@ protected:
       }
    };
 
-   template< int Dimension >
-   struct UpdateEntityTagsLayer
-   {
-      static void exec( Mesh& mesh )
-      {
-         mesh.template updateEntityTagsLayer< Dimension >();
-      }
-   };
-
 // nvcc does not allow __cuda_callable__ lambdas inside private or protected sections
 #ifdef __NVCC__
 public:
@@ -144,8 +126,19 @@ public:
    public:
       static void exec( Mesh& mesh )
       {
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, SetEntitiesCount >::execHost( mesh );
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, ResetEntityTags >::execHost( mesh );
+         // set entities count
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               mesh.template entityTagsSetEntitiesCount< dim >( mesh.template getEntitiesCount< dim >() );
+            }
+         );
+
+         // reset entity tags
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               ResetEntityTags< dim >::exec( mesh );
+            }
+         );
 
          auto kernel = [] __cuda_callable__
             ( GlobalIndexType faceIndex,
@@ -159,7 +152,11 @@ public:
                const GlobalIndexType cellIndex = face.template getSuperentityIndex< Mesh::getMeshDimension() >( 0 );
                mesh->template addEntityTag< Mesh::getMeshDimension() >( cellIndex, EntityTags::BoundaryEntity );
                // initialize all subentities
-               Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() - 1, InitializeSubentities >::exec( *mesh, faceIndex, face );
+               Algorithms::staticFor< int, 0, Mesh::getMeshDimension() - 1 >(
+                  [&mesh, faceIndex, &face] ( auto dim ) {
+                     InitializeSubentities< dim >::exec( *mesh, faceIndex, face );
+                  }
+               );
             }
          };
 
@@ -169,7 +166,12 @@ public:
                                                       kernel,
                                                       &meshPointer.template modifyData< DeviceType >() );
 
-         Algorithms::TemplateStaticFor< int, 0, Mesh::getMeshDimension() + 1, UpdateEntityTagsLayer >::execHost( mesh );
+         // update entity tags
+         Algorithms::staticFor< int, 0, Mesh::getMeshDimension() + 1 >(
+            [&mesh] ( auto dim ) {
+               mesh.template updateEntityTagsLayer< dim >();
+            }
+         );
       }
    };
 
-- 
GitLab


From 82605ea815a9e38c8ace40ad84012908a85321e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 3 Apr 2021 23:50:34 +0200
Subject: [PATCH 07/13] Tests: implemented consistency checks between Mesh and
 all of its entities

---
 src/UnitTests/Meshes/EntityTests.h | 114 +++++++++++++++++++++++++++++
 src/UnitTests/Meshes/MeshTest.h    |   3 +
 2 files changed, 117 insertions(+)
 create mode 100644 src/UnitTests/Meshes/EntityTests.h

diff --git a/src/UnitTests/Meshes/EntityTests.h b/src/UnitTests/Meshes/EntityTests.h
new file mode 100644
index 000000000..109e6d5d9
--- /dev/null
+++ b/src/UnitTests/Meshes/EntityTests.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Algorithms/staticFor.h>
+#include <TNL/Meshes/MeshEntity.h>
+
+namespace EntityTests {
+
+template< typename MeshEntity >
+void testVertex( const MeshEntity& entity )
+{}  // generic fallback overload: performs no check
+
+template< typename MeshConfig, typename Device >
+void testVertex( const TNL::Meshes::MeshEntity< MeshConfig, Device, TNL::Meshes::Topologies::Vertex >& entity )
+{  // overload for entities with the Vertex topology: the entity's point must equal the mesh's point at the entity's index
+   EXPECT_EQ( entity.getPoint(), entity.getMesh().getPoint( entity.getIndex() ) );
+}
+
+// Checks that the entity and its mesh report the same count and indices of subentities with the given subdimension.
+template< int subdimension, typename MeshEntity >
+void testSubentities( const MeshEntity& entity )
+{
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();  // bound by reference: a by-value declaration copies the whole mesh on every call
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+   const auto meshSubentitiesCount = mesh.template getSubentitiesCount< dimension, subdimension >( index );
+   ASSERT_EQ( entity.template getSubentitiesCount< subdimension >(), meshSubentitiesCount );  // ASSERT: do not compare indices when the counts differ
+   for( int i = 0; i < entity.template getSubentitiesCount< subdimension >(); i++ ) {
+      const auto meshSubentityIndex = mesh.template getSubentityIndex< dimension, subdimension >( index, i );
+      EXPECT_EQ( entity.template getSubentityIndex< subdimension >( i ), meshSubentityIndex );
+   }
+}
+
+// Checks that the entity and its mesh report the same count and indices of superentities with the given superdimension.
+template< int superdimension, typename MeshEntity >
+void testSuperentities( const MeshEntity& entity )
+{
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();  // bound by reference: a by-value declaration copies the whole mesh on every call
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+   const auto meshSuperentitiesCount = mesh.template getSuperentitiesCount< dimension, superdimension >( index );
+   ASSERT_EQ( entity.template getSuperentitiesCount< superdimension >(), meshSuperentitiesCount );  // ASSERT: do not compare indices when the counts differ
+   for( int i = 0; i < entity.template getSuperentitiesCount< superdimension >(); i++ ) {
+      const auto meshSuperentityIndex = mesh.template getSuperentityIndex< dimension, superdimension >( index, i );
+      EXPECT_EQ( entity.template getSuperentityIndex< superdimension >( i ), meshSuperentityIndex );
+   }
+}
+
+// test if the entity is consistent with its mesh (i.e. all member functions like
+// getSubentityIndex return the same value when called from the entity and the mesh)
+template< typename MeshEntity >
+void testEntity( const MeshEntity& entity )
+{
+   // static tests for the MeshEntity type
+   static_assert( std::is_constructible< MeshEntity, typename MeshEntity::MeshType, typename MeshEntity::GlobalIndexType >::value,
+                  "MeshEntity should be constructible from its MeshType and GlobalIndexType" );
+   static_assert( ! std::is_default_constructible< MeshEntity >::value,
+                  "MeshEntity should not be default-constructible" );
+   static_assert( std::is_copy_constructible< MeshEntity >::value,
+                  "MeshEntity should be copy-constructible" );
+   static_assert( std::is_move_constructible< MeshEntity >::value,
+                  "MeshEntity should be move-constructible" );
+   static_assert( std::is_copy_assignable< MeshEntity >::value,
+                  "MeshEntity should be copy-assignable" );
+   static_assert( std::is_move_assignable< MeshEntity >::value,
+                  "MeshEntity should be move-assignable" );
+   static_assert( std::is_trivially_destructible< MeshEntity >::value,
+                  "MeshEntity should be trivially destructible" );
+
+   // dynamic tests for the entity
+   const typename MeshEntity::MeshType& mesh = entity.getMesh();  // bound by reference: a by-value declaration copies the whole mesh for every tested entity
+   const typename MeshEntity::GlobalIndexType index = entity.getIndex();
+   constexpr int dimension = MeshEntity::getEntityDimension();
+
+   testVertex( entity );
+   EXPECT_EQ( entity.getTag(), mesh.template getEntityTag< dimension >( index ) );
+
+   TNL::Algorithms::staticFor< int, 0, dimension >(  // subentities: every dimension below the entity's own
+      [&entity] ( auto subdimension ) {
+         testSubentities< subdimension >( entity );
+      }
+   );
+   TNL::Algorithms::staticFor< int, dimension + 1, MeshEntity::MeshType::getMeshDimension() + 1 >(  // superentities: every dimension above the entity's own
+      [&entity] ( auto superdimension ) {
+         testSuperentities< superdimension >( entity );
+      }
+   );
+}
+
+template< int Dimension, typename Mesh >
+void testEntities( const Mesh& mesh )
+{  // runs testEntity on every entity of the given Dimension returned by the mesh
+   using Index = typename Mesh::GlobalIndexType;
+   const Index entitiesCount = mesh.template getEntitiesCount< Dimension >();
+   for( Index i = 0; i < entitiesCount; i++ ) {
+      const auto entity = mesh.template getEntity< Dimension >( i );
+      testEntity( entity );
+   }
+}
+
+} // EntityTests
+
+template< typename Mesh >
+void testEntities( const Mesh& mesh )
+{  // entry point: runs EntityTests::testEntities on the mesh for each dimension in the range below
+   TNL::Algorithms::staticFor< int, 0, Mesh::getMeshDimension() >(  // NOTE(review): the upper bound appears to be exclusive (other calls in this series pass getMeshDimension() + 1 to reach the mesh dimension), so entities of the top dimension seem to be skipped; confirm this is intended
+      [&mesh] ( auto Dimension ) {
+         EntityTests::testEntities< Dimension >( mesh );
+      }
+   );
+}
+#endif
diff --git a/src/UnitTests/Meshes/MeshTest.h b/src/UnitTests/Meshes/MeshTest.h
index e90161b96..788c2172a 100644
--- a/src/UnitTests/Meshes/MeshTest.h
+++ b/src/UnitTests/Meshes/MeshTest.h
@@ -16,6 +16,8 @@
 #include <TNL/Meshes/Topologies/Hexahedron.h>
 #include <TNL/Meshes/MeshBuilder.h>
 
+#include "EntityTests.h"
+
 namespace MeshTest {
 
 using namespace TNL;
@@ -130,6 +132,7 @@ void testFinishedMesh( const Mesh& mesh )
    compareStringRepresentation( mesh, mesh2 );
    testCopyAssignment( mesh );
    testMeshOnCuda( mesh );
+   testEntities( mesh );
 }
 
 TEST( MeshTest, TwoTrianglesTest )
-- 
GitLab


From 3db0331a78980060026704c72527219959ba2cfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 15:27:38 +0200
Subject: [PATCH 08/13] Removed StaticVectorFor from the Algorithms namespace

It is an implementation detail specific to the CutMeshFunction class,
not something that should be part of the public interface. There are
many problems with this approach: e.g. it does not scale to higher
dimensions and it is not parallel.
---
 src/TNL/Algorithms/StaticVectorFor.h | 53 ----------------------------
 src/TNL/Functions/CutMeshFunction.h  | 35 ++++++++++++++++--
 2 files changed, 33 insertions(+), 55 deletions(-)
 delete mode 100644 src/TNL/Algorithms/StaticVectorFor.h

diff --git a/src/TNL/Algorithms/StaticVectorFor.h b/src/TNL/Algorithms/StaticVectorFor.h
deleted file mode 100644
index 664f97aed..000000000
--- a/src/TNL/Algorithms/StaticVectorFor.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/***************************************************************************
-                          StaticVectorFor.h  -  description
-                             -------------------
-    begin                : July 12, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Containers/StaticVector.h>
-
-namespace TNL {
-namespace Algorithms {
-
-struct StaticVectorFor
-{
-   template< typename Index,
-             typename Function,
-             typename... FunctionArgs,
-             int dim >
-   static void exec( const Containers::StaticVector< dim, Index >& begin,
-                     const Containers::StaticVector< dim, Index >& end,
-                     Function f,
-                     FunctionArgs... args )
-   {
-      static_assert( 1 <= dim && dim <= 3, "unsupported dimension" );
-      Containers::StaticVector< dim, Index > index;
-
-      if( dim == 1 ) {
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-            f( index, args... );
-      }
-
-      if( dim == 2 ) {
-         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-               f( index, args... );
-      }
-
-      if( dim == 3 ) {
-         for( index[2] = begin[2]; index[2] < end[2]; index[2]++ )
-         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
-         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
-            f( index, args... );
-      }
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h
index b9ec101cf..66c585252 100644
--- a/src/TNL/Functions/CutMeshFunction.h
+++ b/src/TNL/Functions/CutMeshFunction.h
@@ -10,7 +10,6 @@
 
 #pragma once
 
-#include <TNL/Algorithms/StaticVectorFor.h>
 #include <TNL/Containers/StaticVector.h>
 
 namespace TNL {
@@ -22,6 +21,38 @@ template <  typename MeshFunctionType,
             int codimension=MeshFunctionType::getMeshDimension()-OutMesh::getMeshDimension()>
 class CutMeshFunction
 {
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs,
+             int dim >
+   static void
+   staticVectorFor( const Containers::StaticVector< dim, Index >& begin,
+                    const Containers::StaticVector< dim, Index >& end,
+                    Function f,
+                    FunctionArgs... args )
+   {
+      static_assert( 1 <= dim && dim <= 3, "unsupported dimension" );
+      Containers::StaticVector< dim, Index > index;
+
+      if( dim == 1 ) {
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+            f( index, args... );
+      }
+
+      if( dim == 2 ) {
+         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+               f( index, args... );
+      }
+
+      if( dim == 3 ) {
+         for( index[2] = begin[2]; index[2] < end[2]; index[2]++ )
+         for( index[1] = begin[1]; index[1] < end[1]; index[1]++ )
+         for( index[0] = begin[0]; index[0] < end[0]; index[0]++ )
+            f( index, args... );
+      }
+   }
+
   public:
     static bool Cut(MeshFunctionType &inputMeshFunction,
                     OutMesh &outMesh,
@@ -99,7 +130,7 @@ class CutMeshFunction
 
             typename OutMesh::CoordinatesType starts;
             starts.setValue(0);
-            Algorithms::StaticVectorFor::exec(starts,outMesh.getDimensions(),kernel);
+            staticVectorFor(starts,outMesh.getDimensions(),kernel);
         }
 
         return inCut;
-- 
GitLab


From 562903aab9bb2e06a22f8a584da0057fd4d02a06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 16:56:57 +0200
Subject: [PATCH 09/13] Reimplemented UnrolledFor as a free function
 unrolledFor

---
 .../Examples/Algorithms/CMakeLists.txt        |   6 +-
 ...dForExample.cpp => unrolledForExample.cpp} |  15 +--
 .../Tutorials/ForLoops/tutorial_ForLoops.md   |  24 ++--
 src/TNL/Algorithms/UnrolledFor.h              |  88 -------------
 src/TNL/Algorithms/unrolledFor.h              | 100 ++++++++++++++
 src/TNL/Containers/StaticArray.hpp            |  40 ++++--
 src/TNL/Containers/StaticVector.hpp           |   6 +-
 .../Containers/detail/StaticArrayAssignment.h |  43 +++---
 src/UnitTests/Algorithms/CMakeLists.txt       |   1 +
 src/UnitTests/Algorithms/unrolledForTest.cpp  |   1 +
 src/UnitTests/Algorithms/unrolledForTest.cu   |   1 +
 src/UnitTests/Algorithms/unrolledForTest.h    | 124 ++++++++++++++++++
 12 files changed, 301 insertions(+), 148 deletions(-)
 rename Documentation/Examples/Algorithms/{UnrolledForExample.cpp => unrolledForExample.cpp} (69%)
 delete mode 100644 src/TNL/Algorithms/UnrolledFor.h
 create mode 100644 src/TNL/Algorithms/unrolledFor.h
 create mode 100644 src/UnitTests/Algorithms/unrolledForTest.cpp
 create mode 100644 src/UnitTests/Algorithms/unrolledForTest.cu
 create mode 100644 src/UnitTests/Algorithms/unrolledForTest.h

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 87b544683..294006c08 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -9,11 +9,11 @@ ENDIF()
 ADD_EXECUTABLE(staticForExample staticForExample.cpp)
 ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out )
 
-ADD_EXECUTABLE(UnrolledForExample UnrolledForExample.cpp)
-ADD_CUSTOM_COMMAND( COMMAND UnrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UnrolledForExample.out OUTPUT UnrolledForExample.out )
+ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp)
+ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out )
 
 ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    ParallelForExample.out
-   UnrolledForExample.out
+   unrolledForExample.out
    staticForExample.out
 )
diff --git a/Documentation/Examples/Algorithms/UnrolledForExample.cpp b/Documentation/Examples/Algorithms/unrolledForExample.cpp
similarity index 69%
rename from Documentation/Examples/Algorithms/UnrolledForExample.cpp
rename to Documentation/Examples/Algorithms/unrolledForExample.cpp
index 05e438cfd..912029e3e 100644
--- a/Documentation/Examples/Algorithms/UnrolledForExample.cpp
+++ b/Documentation/Examples/Algorithms/unrolledForExample.cpp
@@ -1,6 +1,6 @@
 #include <iostream>
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Algorithms/UnrolledFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
@@ -19,13 +19,12 @@ int main( int argc, char* argv[] )
    /****
     * Compute an addition of a vector and a constant number.
     */
-   auto addition = [&]( int i, const double& c )
-   {
-      a[ i ] = b[ i ] + c;
-      sum += a[ i ];
-   };
-   Algorithms::UnrolledFor< 0, Size >::exec( addition, 3.14 );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&]( int i ) {
+         a[ i ] = b[ i ] + 3.14;
+         sum += a[ i ];
+      }
+   );
    std::cout << "a = " << a << std::endl;
    std::cout << "sum = " << sum << std::endl;
 }
-
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
index 9e1663102..1284783e6 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -65,24 +65,24 @@ For completeness, we show modification of the previous example into 3D:
 
 ## Unrolled For
 
-\ref TNL::Algorithms::UnrolledFor is a for-loop that it is explicitly unrolled via C++ templates when the loop is short (up to eight iterations).
-The bounds of `UnrolledFor` loops must be constant (i.e. known at the compile time).
+\ref TNL::Algorithms::unrolledFor is a for-loop that is explicitly unrolled via C++ templates when the loop is short (up to eight iterations).
+The bounds of `unrolledFor` loops must be constant (i.e. known at compile time).
 It is often used with static arrays and vectors.
 
 See the following example:
 
-\include UnrolledForExample.cpp
+\include unrolledForExample.cpp
 
 Notice that the unrolled for-loop works with a lambda function similar to parallel for-loop.
-The bounds of the loop are passed as template parameters in the statement `Algorithms::UnrolledFor< 0, Size >`.
-The parameters of the static method `exec` are the lambda functions to be performed in each iteration and auxiliary data to be passed to the function.
-The function gets the loop index `i` first followed by the auxiliary data `sum` in this example.
+The bounds of the loop are passed as template parameters in the statement `Algorithms::unrolledFor< int, 0, Size >`.
+The parameter of the `unrolledFor` function is the functor to be called in each iteration.
+The functor receives only the loop index `i` as its argument.
 
 The result looks as:
 
-\include UnrolledForExample.out
+\include unrolledForExample.out
 
-The effect of `UnrolledFor` is really the same as usual for-loop.
+The effect of `unrolledFor` is really the same as that of a usual for-loop.
 The following code does the same as the previous example:
 
 ```cpp
@@ -93,14 +93,14 @@ for( int i = 0; i < Size; i++ )
 };
 ```
 
-The benefit of `UnrolledFor` is mainly in the explicit unrolling of short loops which can improve performance in some situations.
-`UnrolledFor` can be forced to do the loop-unrolling in any situations using the third template parameter as follows:
+The benefit of `unrolledFor` is mainly in the explicit unrolling of short loops which can improve performance in some situations.
+The maximum length of loops that will be fully unrolled can be specified using the fourth template parameter as follows:
 
 ```cpp
-Algorithms::UnrolledFor< 0, Size, true >::exec( addition, 3.14 );
+Algorithms::unrolledFor< int, 0, Size, 16 >( ... );
 ```
 
-`UnrolledFor` can be used also in CUDA kernels.
+`unrolledFor` can also be used in CUDA kernels.
 
 ## Static For
 
diff --git a/src/TNL/Algorithms/UnrolledFor.h b/src/TNL/Algorithms/UnrolledFor.h
deleted file mode 100644
index 710f2a0d3..000000000
--- a/src/TNL/Algorithms/UnrolledFor.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************
-                          UnrolledFor.h  -  description
-                             -------------------
-    begin                : Jul 16, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <utility>
-
-#include <TNL/Cuda/CudaCallable.h>
-
-namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief UnrolledFor is a wrapper for common for-loop with explicit unrolling.
- *
- * UnrolledFor can be used only for for-loops bounds of which are known at the
- * compile time. UnrolledFor performs explicit loop unrolling for better performance.
- * This, however, does not make sense for loops with a large iterations
- * count. For a very large iterations count it could trigger the compiler's
- * limit on recursive template instantiation. Also note that the compiler
- * will (at least partially) unroll loops with static bounds anyway. For theses
- * reasons, the explicit loop unrolling can be controlled by the third template
- * parameter.
- *
- * \tparam Begin the loop will iterate over indexes [Begin,End)
- * \tparam End the loop will iterate over indexes [Begin,End)
- * \tparam unrolled controls the explicit loop unrolling. If it is true, the
- *   unrolling is performed.
- *
- * \par Example
- * \include Algorithms/UnrolledForExample.cpp
- * \par Output
- * \include UnrolledForExample.out
- */
-template< int Begin, int End, bool unrolled = (End - Begin <= 8) >
-struct UnrolledFor;
-
-template< int Begin, int End >
-struct UnrolledFor< Begin, End, true >
-{
-   static_assert( Begin < End, "Wrong index interval for UnrolledFor. Begin must be less than end." );
-
-   /**
-    * \brief Static method for the execution of the UnrolledFor.
-    *
-    * \param f is a (lambda) function to be performed in each iteration.
-    * \param args are auxiliary data to be passed to the function f.
-    */
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args )
-   {
-      f( Begin, args... );
-      UnrolledFor< Begin + 1, End >::exec( f, std::forward< Args >( args )... );
-   }
-};
-
-template< int End >
-struct UnrolledFor< End, End, true >
-{
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args ) {}
-};
-
-template< int Begin, int End >
-struct UnrolledFor< Begin, End, false >
-{
-   static_assert( Begin <= End, "Wrong index interval for UnrolledFor. Begin must be less than or equal to end." );
-
-   template< typename Function, typename... Args >
-   __cuda_callable__
-   static void exec( const Function& f, Args&&... args )
-   {
-      for( int i = Begin; i < End; i++ )
-         f( i, std::forward< Args >( args )... );
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/unrolledFor.h b/src/TNL/Algorithms/unrolledFor.h
new file mode 100644
index 000000000..7a5477f52
--- /dev/null
+++ b/src/TNL/Algorithms/unrolledFor.h
@@ -0,0 +1,100 @@
+/***************************************************************************
+                          unrolledFor.h  -  description
+                             -------------------
+    begin                : Jul 16, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <utility>
+
+namespace TNL {
+namespace Algorithms {
+
+namespace detail {
+
+template< typename Index, Index begin, Index end >
+struct UnrolledFor
+{
+   static_assert( begin < end, "internal error - wrong iteration index for UnrolledFor" );
+
+   template< typename Func >
+   static constexpr void exec( Func&& f )
+   {
+      f( begin );
+      UnrolledFor< Index, begin + 1, end >::exec( std::forward< Func >( f ) );
+   }
+};
+
+template< typename Index, Index end >
+struct UnrolledFor< Index, end, end >
+{
+   template< typename Func >
+   static constexpr void exec( Func&& f ) {}
+};
+
+// specialization for short loops - unrolling
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin <= unrollFactor) >
+unrolled_for_dispatch( Func&& f )
+{
+   UnrolledFor< Index, begin, end >::exec( std::forward< Func >( f ) );
+}
+
+// specialization for long loops - normal for-loop
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin > unrollFactor) >
+unrolled_for_dispatch( Func&& f )
+{
+   for( Index i = begin; i < end; i++ )
+      f( i );
+}
+
+// specialization for empty loop
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+unrolled_for_dispatch( Func&& f )
+{}
+
+} // namespace detail
+
+/**
+ * \brief Generic for-loop with explicit unrolling.
+ *
+ * \e unrolledFor performs explicit loop unrolling of short loops which can
+ * improve performance in some cases. The bounds of the for-loop must be constant
+ * (i.e. known at compile time). Loops longer than \e unrollFactor are not
+ * unrolled; they are executed as a normal for-loop instead.
+ *
+ * The unroll factor is configurable, but note that full unrolling does not
+ * make sense for very long loops. It might even trigger the compiler's limit
+ * on recursive template instantiation. Also note that the compiler will (at
+ * least partially) unroll loops with static bounds anyway.
+ *
+ * \tparam Index is the type of the loop indices.
+ * \tparam begin is the left bound of the iteration range `[begin, end)`.
+ * \tparam end is the right bound of the iteration range `[begin, end)`.
+ * \tparam unrollFactor is the maximum length of loops to fully unroll via
+ *    recursive template instantiation.
+ * \tparam Func is the type of the functor (it is usually deduced from the
+ *    argument used in the function call).
+ *
+ * \param f is the functor to be called in each iteration.
+ *
+ * \par Example
+ * \include Algorithms/unrolledForExample.cpp
+ * \par Output
+ * \include unrolledForExample.out
+ */
+template< typename Index, Index begin, Index end, Index unrollFactor = 8,  typename Func >
+constexpr void unrolledFor( Func&& f )
+{
+   detail::unrolled_for_dispatch< Index, begin, end, unrollFactor >( std::forward< Func >( f ) );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index 87e3eea9d..c6c18fb0b 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Math.h>
 #include <TNL/Containers/StaticArray.h>
 #include <TNL/Containers/detail/StaticArrayAssignment.h>
-#include <TNL/Algorithms/UnrolledFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 namespace TNL {
 namespace Containers {
@@ -49,7 +49,7 @@ struct StaticArrayComparator< Size, LeftValue, RightValue, Size >
 ////
 // Static array sort does static loop unrolling of array sort.
 // It performs static variant of bubble sort as follows:
-// 
+//
 // for( int k = Size - 1; k > 0; k--)
 //   for( int i = 0; i < k; i++ )
 //      if( data[ i ] > data[ i+1 ] )
@@ -102,21 +102,33 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value v[ Size ] )
 {
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         (*this)[ i ] = v[ i ];
+      }
+   );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const Value& v )
 {
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), v );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         (*this)[ i ] = v;
+      }
+   );
 }
 
 template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >::StaticArray( const StaticArray< Size, Value >& v )
 {
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), v.getData() );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         (*this)[ i ] = v[ i ];
+      }
+   );
 }
 
 template< int Size, typename Value >
@@ -228,7 +240,11 @@ template< int Size, typename Value >
 __cuda_callable__
 StaticArray< Size, Value >& StaticArray< Size, Value >::operator=( const StaticArray< Size, Value >& array )
 {
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, getData(), array.getData() );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         (*this)[ i ] = array[ i ];
+      }
+   );
    return *this;
 }
 
@@ -264,7 +280,11 @@ StaticArray< Size, Value >::
 operator StaticArray< Size, OtherValue >() const
 {
    StaticArray< Size, OtherValue > aux;
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), getData() );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         aux[ i ] = (*this)[ i ];
+      }
+   );
    return aux;
 }
 
@@ -272,7 +292,11 @@ template< int Size, typename Value >
 __cuda_callable__
 void StaticArray< Size, Value >::setValue( const ValueType& val )
 {
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignValueFunctor{}, getData(), val );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         (*this)[ i ] = val;
+      }
+   );
 }
 
 template< int Size, typename Value >
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index d021cd78d..bb22eba8c 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -99,7 +99,11 @@ StaticVector< Size, Real >::
 operator StaticVector< Size, OtherReal >() const
 {
    StaticVector< Size, OtherReal > aux;
-   Algorithms::UnrolledFor< 0, Size >::exec( detail::AssignArrayFunctor{}, aux.getData(), this->getData() );
+   Algorithms::unrolledFor< int, 0, Size >(
+      [&] ( int i ) mutable {
+         aux[ i ] = (*this)[ i ];
+      }
+   );
    return aux;
 }
 
diff --git a/src/TNL/Containers/detail/StaticArrayAssignment.h b/src/TNL/Containers/detail/StaticArrayAssignment.h
index eae5f474b..0e3f0e366 100644
--- a/src/TNL/Containers/detail/StaticArrayAssignment.h
+++ b/src/TNL/Containers/detail/StaticArrayAssignment.h
@@ -11,32 +11,12 @@
 #pragma once
 
 #include <TNL/TypeTraits.h>
-#include <TNL/Algorithms/UnrolledFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
 
 namespace TNL {
 namespace Containers {
 namespace detail {
 
-struct AssignArrayFunctor
-{
-   template< typename LeftValue, typename RightValue >
-   __cuda_callable__
-   void operator()( int i, LeftValue* data, const RightValue* v ) const
-   {
-      data[ i ] = v[ i ];
-   }
-};
-
-struct AssignValueFunctor
-{
-   template< typename LeftValue, typename RightValue >
-   __cuda_callable__
-   void operator()( int i, LeftValue* data, const RightValue& v ) const
-   {
-      data[ i ] = v;
-   }
-};
-
 template< typename StaticArray,
           typename T,
           bool isStaticArrayType = IsStaticArrayType< T >::value >
@@ -49,11 +29,15 @@ template< typename StaticArray,
           typename T >
 struct StaticArrayAssignment< StaticArray, T, true >
 {
-   __cuda_callable__
-   static void assign( StaticArray& a, const T& v )
+   static constexpr void assign( StaticArray& a, const T& v )
    {
-      static_assert( StaticArray::getSize() == T::getSize(), "Cannot assign static arrays with different size." );
-      Algorithms::UnrolledFor< 0, StaticArray::getSize() >::exec( AssignArrayFunctor{}, a.getData(), v.getData() );
+      static_assert( StaticArray::getSize() == T::getSize(),
+                     "Cannot assign static arrays with different size." );
+      Algorithms::unrolledFor< int, 0, StaticArray::getSize() >(
+         [&] ( int i ) mutable {
+            a[ i ] = v[ i ];
+         }
+      );
    }
 };
 
@@ -65,10 +49,13 @@ template< typename StaticArray,
           typename T >
 struct StaticArrayAssignment< StaticArray, T, false >
 {
-   __cuda_callable__
-   static void assign( StaticArray& a, const T& v )
+   static constexpr void assign( StaticArray& a, const T& v )
    {
-      Algorithms::UnrolledFor< 0, StaticArray::getSize() >::exec( AssignValueFunctor{}, a.getData(), v );
+      Algorithms::unrolledFor< int, 0, StaticArray::getSize() >(
+         [&] ( int i ) mutable {
+            a[ i ] = v;
+         }
+      );
    }
 };
 
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index dd269c8bc..1e4361f49 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -5,6 +5,7 @@ set( COMMON_TESTS
          MultireductionTest
          ParallelForTest
          staticForTest
+         unrolledForTest
 )
 
 set( CPP_TESTS )
diff --git a/src/UnitTests/Algorithms/unrolledForTest.cpp b/src/UnitTests/Algorithms/unrolledForTest.cpp
new file mode 100644
index 000000000..81f9042e5
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.cpp
@@ -0,0 +1 @@
+#include "unrolledForTest.h"
diff --git a/src/UnitTests/Algorithms/unrolledForTest.cu b/src/UnitTests/Algorithms/unrolledForTest.cu
new file mode 100644
index 000000000..81f9042e5
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.cu
@@ -0,0 +1 @@
+#include "unrolledForTest.h"
diff --git a/src/UnitTests/Algorithms/unrolledForTest.h b/src/UnitTests/Algorithms/unrolledForTest.h
new file mode 100644
index 000000000..bda0e9024
--- /dev/null
+++ b/src/UnitTests/Algorithms/unrolledForTest.h
@@ -0,0 +1,124 @@
+/***************************************************************************
+                          unrolledForTest.h  -  description
+                             -------------------
+    begin                : Apr 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <array>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/unrolledFor.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+
+#ifdef HAVE_GTEST
+template< int N >
+void test_host()
+{
+   std::array< int, N > a;
+   a.fill( 0 );
+
+   unrolledFor< int, 0, N >(
+      [&a] ( auto i ) {
+         a[ i ] += 1;
+      }
+   );
+
+   std::array< int, N > expected;
+   expected.fill( 1 );
+   EXPECT_EQ( a, expected );
+}
+
+TEST( unrolledForTest, host_size_8 )
+{
+   test_host<8>();
+}
+
+TEST( unrolledForTest, host_size_97 )
+{
+   test_host<97>();
+}
+
+TEST( unrolledForTest, host_size_5000 )
+{
+   test_host<5000>();
+}
+
+TEST( unrolledForTest, host_empty )
+{
+   bool called = false;
+
+   unrolledFor< int, 0, 0 >(
+      [&called] ( auto i ) {
+         called = true;
+      }
+   );
+   EXPECT_FALSE( called );
+
+   unrolledFor< int, 0, -1 >(
+      [&called] ( auto i ) {
+         called = true;
+      }
+   );
+   EXPECT_FALSE( called );
+}
+
+#ifdef HAVE_CUDA
+template< int N >
+void test_cuda()
+{
+   using Array = Containers::Array< int, Devices::Cuda >;
+   using ArrayHost = Containers::Array< int, Devices::Host >;
+   Array a( N );
+   a.setValue( 0 );
+   auto view = a.getView();
+
+   auto kernel = [=] __cuda_callable__ (int j) mutable
+   {
+      unrolledFor< int, 0, N >(
+         [&view] ( auto i ) {
+            view[ i ] += 1;
+         }
+      );
+   };
+   ParallelFor< Devices::Cuda >::exec( 0, 1, kernel );
+
+   ArrayHost expected;
+   expected.setSize( N );
+   expected.setValue( 1 );
+
+   ArrayHost ah;
+   ah = a;
+   EXPECT_EQ( ah, expected );
+}
+
+TEST( unrolledForTest, cuda_size_8 )
+{
+   test_cuda<8>();
+}
+
+TEST( unrolledForTest, cuda_size_97 )
+{
+   test_cuda<97>();
+}
+
+TEST( unrolledForTest, cuda_size_5000 )
+{
+   test_cuda<5000>();
+}
+#endif
+#endif
+
+#include "../main.h"
-- 
GitLab


From 3c7d5d428c9a718ce929b7241b96082f533f27f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 22:23:44 +0200
Subject: [PATCH 10/13] Minor changes to the Array, ArrayView, Vector and
 VectorView classes

- passing indices and array/vector values by reference is useless
- declaring arguments passed by value as "const" is useless
- using "IndexType" and "ValueType" is better than "Index" and "Value",
  because Doxygen creates an automatic link to the description of the
  type
- various other documentation and whitespace fixes
---
 src/TNL/Containers/Array.h       |  82 +++++++++++-----------
 src/TNL/Containers/Array.hpp     |  46 ++++++-------
 src/TNL/Containers/ArrayView.h   |  92 ++++++++++++-------------
 src/TNL/Containers/ArrayView.hpp | 115 +++++++++++++++++--------------
 src/TNL/Containers/Vector.h      |  10 +--
 src/TNL/Containers/VectorView.h  |   4 +-
 6 files changed, 180 insertions(+), 169 deletions(-)

diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 53c9290ca..77c85c750 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -136,7 +136,7 @@ class Array
        * \param size The number of array elements to be allocated.
        * \param allocator The allocator to be associated with this array.
        */
-      explicit Array( const IndexType& size, const AllocatorType& allocator = AllocatorType() );
+      explicit Array( IndexType size, const AllocatorType& allocator = AllocatorType() );
 
       /**
        * \brief Constructs an array with given size and value.
@@ -145,7 +145,7 @@ class Array
        * \param value The value all elements will be set to.
        * \param allocator The allocator to be associated with this array.
        */
-      explicit Array( const IndexType& size, const Value& value, const AllocatorType& allocator = AllocatorType() );
+      explicit Array( IndexType size, ValueType value, const AllocatorType& allocator = AllocatorType() );
 
       /**
        * \brief Constructs an array with given size and copies data from given
@@ -155,8 +155,8 @@ class Array
        * \param size The number of array elements to be copied to the array.
        * \param allocator The allocator to be associated with this array.
        */
-      Array( Value* data,
-             const IndexType& size,
+      Array( ValueType* data,
+             IndexType size,
              const AllocatorType& allocator = AllocatorType() );
 
       /**
@@ -287,7 +287,7 @@ class Array
        *
        * \param size The new size of the array.
        */
-      void resize( Index size );
+      void resize( IndexType size );
 
       /**
        * \brief Method for resizing the array with an initial value.
@@ -306,7 +306,7 @@ class Array
        * \param size The new size of the array.
        * \param value The value to initialize new elements with.
        */
-      void resize( Index size, const ValueType& value );
+      void resize( IndexType size, ValueType value );
 
       /**
        * \brief Method for setting the array size.
@@ -318,14 +318,14 @@ class Array
        *
        * \param size The new size of the array.
        */
-      void setSize( Index size );
+      void setSize( IndexType size );
 
       /**
        * \brief Returns the current array size.
        *
        * This method can be called from device kernels.
        */
-      __cuda_callable__ Index getSize() const;
+      __cuda_callable__ IndexType getSize() const;
 
       /**
        * \brief Sets the same size as the size of an existing array.
@@ -448,10 +448,10 @@ class Array
        * where the array is allocated.
        *
        * \param i The index of the element to be set.
-       * \param v The new value of the element.
+       * \param value The new value of the element.
        */
       __cuda_callable__
-      void setElement( const Index& i, const Value& v );
+      void setElement( IndexType i, ValueType value );
 
       /**
        * \brief Returns the value of the \e i-th element.
@@ -462,7 +462,7 @@ class Array
        * \param i The index of the element to be returned.
        */
       __cuda_callable__
-      Value getElement( const Index& i ) const;
+      ValueType getElement( IndexType i ) const;
 
       /**
        * \brief Accesses the \e i-th element of the array.
@@ -479,7 +479,7 @@ class Array
        * \param i The index of the element to be accessed.
        * \return Reference to the \e i-th element.
        */
-      __cuda_callable__ Value& operator[]( const Index& i );
+      __cuda_callable__ Value& operator[]( IndexType i );
 
       /**
        * \brief Accesses the \e i-th element of the array.
@@ -496,7 +496,7 @@ class Array
        * \param i The index of the element to be accessed.
        * \return Constant reference to the \e i-th element.
        */
-      __cuda_callable__ const Value& operator[]( const Index& i ) const;
+      __cuda_callable__ const Value& operator[]( IndexType i ) const;
 
       /**
        * \brief Copy-assignment operator for copying data from another array.
@@ -557,7 +557,7 @@ class Array
        *         container, e.g. \ref Array, \ref ArrayView, \ref Vector,
        *         \ref VectorView, etc.
        * \param array Reference to the array-like container.
-       * \return \ref True if both arrays are element-wise equal and \ref false
+       * \return `true` if both arrays are element-wise equal and `false`
        *         otherwise.
        */
       template< typename ArrayT >
@@ -582,13 +582,13 @@ class Array
        * or \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are set.
        *
-       * \param v The new value for the array elements.
+       * \param value The new value for the array elements.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
        */
-      void setValue( const ValueType& v,
+      void setValue( ValueType value,
                      IndexType begin = 0,
                      IndexType end = 0 );
 
@@ -603,8 +603,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -633,8 +633,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -663,8 +663,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -691,8 +691,8 @@ class Array
        *
        * where
        *
-       * \param elementIdx is an index of the array element being currently processed
-       * \param elementValue is a value of the array element being currently processed
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
        *
        * This is performed at the same place where the array is allocated,
        * i.e. it is efficient even on GPU.
@@ -727,7 +727,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -744,7 +744,7 @@ class Array
       template< typename Fetch,
                 typename Reduce,
                 typename Result >
-      Result reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
+      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
 
        /**
         * \brief Computes reduction with array elements on interval [ \e begin, \e end) for constant instances.
@@ -765,7 +765,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -782,7 +782,7 @@ class Array
       template< typename Fetch,
                 typename Reduce,
                 typename Result >
-      Result reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
+      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
 
        /**
         * \brief Computes reduction with all array elements.
@@ -801,7 +801,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -837,7 +837,7 @@ class Array
         * being currently processed:
         *
         * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
         * ```
         *
         * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -863,15 +863,15 @@ class Array
        * \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are checked.
        *
-       * \param v The value to be checked.
+       * \param value The value to be checked.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
-       * \return True if there is _at least one_ element in the sub-interval
-       *         `[begin, end)` which has the value \e v.
+       * \return `true` if there is _at least one_ element in the sub-interval
+       *         `[begin, end)` which has the value \e value.
        */
-      bool containsValue( const ValueType& v,
+      bool containsValue( ValueType value,
                           IndexType begin = 0,
                           IndexType end = 0 ) const;
 
@@ -882,15 +882,15 @@ class Array
        * \e end is set to a non-zero value, only elements in the sub-interval
        * `[begin, end)` are checked.
        *
-       * \param v The value to be checked.
+       * \param value The value to be checked.
        * \param begin The beginning of the array sub-interval. It is 0 by
        *              default.
        * \param end The end of the array sub-interval. The default value is 0
        *            which is, however, replaced with the array size.
-       * \return True if there is _all_ elements in the sub-interval
-       *         `[begin, end)` have the same value \e v.
+       * \return `true` if _all_ elements in the sub-interval `[begin, end)`
+       *         have the same value \e value.
        */
-      bool containsOnlyValue( const ValueType& v,
+      bool containsOnlyValue( ValueType value,
                               IndexType begin = 0,
                               IndexType end = 0 ) const;
 
@@ -919,13 +919,13 @@ class Array
       /** \brief Internal method for reallocating array elements. Used only
        * from the two overloads of \ref resize.
        */
-      void reallocate( Index size );
+      void reallocate( IndexType size );
 
       /** \brief Pointer to the data. */
       Value* data = nullptr;
 
       /** \brief Number of elements in the array. */
-      Index size = 0;
+      IndexType size = 0;
 
       /**
        * \brief The internal allocator instance.
@@ -941,7 +941,7 @@ class Array
  * \tparam Index is a type used for the indexing of the array elements.
  *
  * \param str is a output stream.
- * \param view is the array to be printed.
+ * \param array is the array to be printed.
  *
  * \return a reference on the output stream \ref std::ostream&.
  */
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index f313a4cf5..d310534a3 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -52,7 +52,7 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( const IndexType& size, const AllocatorType& allocator )
+Array( IndexType size, const AllocatorType& allocator )
 : allocator( allocator )
 {
    this->setSize( size );
@@ -63,7 +63,7 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( const IndexType& size, const Value& value, const AllocatorType& allocator )
+Array( IndexType size, ValueType value, const AllocatorType& allocator )
 : allocator( allocator )
 {
    this->setSize( size );
@@ -75,8 +75,8 @@ template< typename Value,
           typename Index,
           typename Allocator >
 Array< Value, Device, Index, Allocator >::
-Array( Value* data,
-       const IndexType& size,
+Array( ValueType* data,
+       IndexType size,
        const AllocatorType& allocator )
 : allocator( allocator )
 {
@@ -244,7 +244,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-reallocate( Index size )
+reallocate( IndexType size )
 {
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 
@@ -288,10 +288,10 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-resize( Index size )
+resize( IndexType size )
 {
    // remember the old size and reallocate the array
-   const Index old_size = this->size;
+   const IndexType old_size = this->size;
    reallocate( size );
 
    if( old_size < size )
@@ -306,10 +306,10 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-resize( Index size, const ValueType& value )
+resize( IndexType size, ValueType value )
 {
    // remember the old size and reallocate the array
-   const Index old_size = this->size;
+   const IndexType old_size = this->size;
    reallocate( size );
 
    if( old_size < size )
@@ -323,7 +323,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-setSize( Index size )
+setSize( IndexType size )
 {
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 
@@ -495,7 +495,7 @@ template< typename Value,
           typename Allocator >
 __cuda_callable__ void
 Array< Value, Device, Index, Allocator >::
-setElement( const Index& i, const Value& x )
+setElement( IndexType i, ValueType x )
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -508,7 +508,7 @@ template< typename Value,
           typename Allocator >
 __cuda_callable__ Value
 Array< Value, Device, Index, Allocator >::
-getElement( const Index& i ) const
+getElement( IndexType i ) const
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -522,7 +522,7 @@ template< typename Value,
 __cuda_callable__
 Value&
 Array< Value, Device, Index, Allocator >::
-operator[]( const Index& i )
+operator[]( IndexType i )
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -541,7 +541,7 @@ template< typename Value,
 __cuda_callable__
 const Value&
 Array< Value, Device, Index, Allocator >::
-operator[]( const Index& i ) const
+operator[]( IndexType i ) const
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -647,9 +647,7 @@ operator==( const ArrayT& array ) const
    if( this->getSize() == 0 )
       return true;
    return Algorithms::MultiDeviceMemoryOperations< Device, typename ArrayT::DeviceType >::
-            compare( this->getData(),
-                           array.getData(),
-                           array.getSize() );
+            compare( this->getData(), array.getData(), array.getSize() );
 }
 
 template< typename Value,
@@ -670,7 +668,7 @@ template< typename Value,
           typename Allocator >
 void
 Array< Value, Device, Index, Allocator >::
-setValue( const ValueType& v,
+setValue( ValueType v,
           IndexType begin,
           IndexType end )
 {
@@ -742,7 +740,7 @@ template< typename Value,
          typename Result >
 Result
 Array< Value, Device, Index, Allocator >::
-reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    return this->getView().reduceElements( begin, end, fetch, reduce, zero );
 }
@@ -756,7 +754,7 @@ template< typename Value,
          typename Result >
 Result
 Array< Value, Device, Index, Allocator >::
-reduceElements( const Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    return this->getConstView().reduceElements( begin, end, fetch, reduce, zero );
 }
@@ -795,7 +793,7 @@ template< typename Value,
           typename Allocator >
 bool
 Array< Value, Device, Index, Allocator >::
-containsValue( const ValueType& v,
+containsValue( ValueType value,
                IndexType begin,
                IndexType end ) const
 {
@@ -803,7 +801,7 @@ containsValue( const ValueType& v,
    if( end == 0 )
       end = this->getSize();
 
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, v );
+   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
 }
 
 template< typename Value,
@@ -812,7 +810,7 @@ template< typename Value,
           typename Allocator >
 bool
 Array< Value, Device, Index, Allocator >::
-containsOnlyValue( const ValueType& v,
+containsOnlyValue( ValueType value,
                    IndexType begin,
                    IndexType end ) const
 {
@@ -820,7 +818,7 @@ containsOnlyValue( const ValueType& v,
    if( end == 0 )
       end = this->getSize();
 
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, v );
+   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 4b3846037..eb7e548b0 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -119,7 +119,7 @@ public:
     * \param size The number of elements in the array view.
     */
    __cuda_callable__
-   ArrayView( Value* data, Index size );
+   ArrayView( ValueType* data, IndexType size );
 
    /**
     * \brief Shallow copy constructor.
@@ -165,7 +165,7 @@ public:
     * \param size The number of elements in the array view.
     */
    __cuda_callable__
-   void bind( Value* data, const Index size );
+   void bind( ValueType* data, IndexType size );
 
    /**
     * \brief Method for rebinding (reinitialization) using another array view.
@@ -193,7 +193,7 @@ public:
     *            which is, however, replaced with the array size.
     */
    __cuda_callable__
-   ViewType getView( const IndexType begin = 0, IndexType end = 0 );
+   ViewType getView( IndexType begin = 0, IndexType end = 0 );
 
    /**
     * \brief Returns a non-modifiable view of the array view.
@@ -208,7 +208,7 @@ public:
     *            which is, however, replaced with the array size.
     */
    __cuda_callable__
-   ConstViewType getConstView( const IndexType begin = 0, IndexType end = 0 ) const;
+   ConstViewType getConstView( IndexType begin = 0, IndexType end = 0 ) const;
 
    /**
     * \brief Deep copy assignment operator for copying data from another array
@@ -270,7 +270,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   const Value* getData() const;
+   const ValueType* getData() const;
 
    /**
     * \brief Returns a raw pointer to the data.
@@ -278,7 +278,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Value* getData();
+   ValueType* getData();
 
    /**
     * \brief Returns a \e const-qualified raw pointer to the data.
@@ -289,7 +289,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   const Value* getArrayData() const;
+   const ValueType* getArrayData() const;
 
    /**
     * \brief Returns a raw pointer to the data.
@@ -300,7 +300,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Value* getArrayData();
+   ValueType* getArrayData();
 
    /**
     * \brief Returns the current size of the array view.
@@ -308,7 +308,7 @@ public:
     * This method can be called from device kernels.
     */
    __cuda_callable__
-   Index getSize() const;
+   IndexType getSize() const;
 
    /**
     * \brief Sets the value of the \e i-th element to \e v.
@@ -317,10 +317,10 @@ public:
     * where the array is allocated.
     *
     * \param i The index of the element to be set.
-    * \param v The new value of the element.
+    * \param value The new value of the element.
     */
    __cuda_callable__
-   void setElement( Index i, Value value );
+   void setElement( IndexType i, ValueType value );
 
    /**
     * \brief Returns the value of the \e i-th element.
@@ -331,7 +331,7 @@ public:
     * \param i The index of the element to be returned.
     */
    __cuda_callable__
-   Value getElement( Index i ) const;
+   ValueType getElement( IndexType i ) const;
 
    /**
     * \brief Accesses the \e i-th element of the array view.
@@ -349,7 +349,7 @@ public:
     * \return Reference to the \e i-th element.
     */
    __cuda_callable__
-   Value& operator[]( Index i );
+   Value& operator[]( IndexType i );
 
    /**
     * \brief Accesses the \e i-th element of the array view.
@@ -367,7 +367,7 @@ public:
     * \return Constant reference to the \e i-th element.
     */
    __cuda_callable__
-   const Value& operator[]( Index i ) const;
+   const Value& operator[]( IndexType i ) const;
 
    /**
     * \brief Compares the array view with another array-like container.
@@ -401,15 +401,15 @@ public:
     * \e begin or \e end is set to a non-zero value, only elements in the
     * sub-interval `[begin, end)` are set.
     *
-    * \param v The new value for the array view elements.
+    * \param value The new value for the array view elements.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
     */
-   void setValue( Value value,
-                  const Index begin = 0,
-                  Index end = 0 );
+   void setValue( ValueType value,
+                  IndexType begin = 0,
+                  IndexType end = 0 );
 
    /**
     * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
@@ -422,8 +422,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -452,8 +452,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place
     * where the array is allocated, i.e. it is efficient even on GPU.
@@ -481,8 +481,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -509,8 +509,8 @@ public:
     *
     * where
     *
-    * \param elementIdx is an index of the array element being currently processed
-    * \param elementValue is a value of the array element being currently processed
+    * - \e elementIdx is an index of the array element being currently processed
+    * - \e elementValue is a value of the array element being currently processed
     *
     * This is performed at the same place where the array is allocated,
     * i.e. it is efficient even on GPU.
@@ -545,7 +545,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -562,7 +562,7 @@ public:
    template< typename Fetch,
              typename Reduce,
              typename Result >
-   Result reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
+   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
 
    /**
     * \brief Computes reduction with array view elements on interval [ \e begin, \e end) for constant instances.
@@ -583,7 +583,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -600,7 +600,7 @@ public:
    template< typename Fetch,
              typename Reduce,
              typename Result >
-   Result reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
+   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
 
    /**
     * \brief Computes reduction with all array view elements.
@@ -619,7 +619,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -655,7 +655,7 @@ public:
     * being currently processed:
     *
     * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( Index idx, Value& value ) -> Result { return ... };
+    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
     * ```
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
@@ -681,17 +681,17 @@ public:
     * \e end is set to a non-zero value, only elements in the sub-interval
     * `[begin, end)` are checked.
     *
-    * \param v The value to be checked.
+    * \param value The value to be checked.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
-    * \return True if there is _at least one_ element in the sub-interval
-    *         `[begin, end)` which has the value \e v.
+    * \return `true` if there is _at least one_ element in the sub-interval
+    *         `[begin, end)` which has the value \e value.
     */
-   bool containsValue( Value value,
-                       const Index begin = 0,
-                       Index end = 0  ) const;
+   bool containsValue( ValueType value,
+                       IndexType begin = 0,
+                       IndexType end = 0 ) const;
 
    /**
     * \brief Checks if all elements have the same value \e v.
@@ -700,17 +700,17 @@ public:
     * \e end is set to a non-zero value, only elements in the sub-interval
     * `[begin, end)` are checked.
     *
-    * \param v The value to be checked.
+    * \param value The value to be checked.
     * \param begin The beginning of the array view sub-interval. It is 0 by
     *              default.
     * \param end The end of the array view sub-interval. The default value is 0
     *            which is, however, replaced with the array view size.
-    * \return True if there is _all_ elements in the sub-interval
-    *         `[begin, end)` have the same value \e v.
+    * \return `true` if _all_ elements in the sub-interval `[begin, end)`
+    *         have the same value \e value.
     */
-   bool containsOnlyValue( Value value,
-                           const Index begin = 0,
-                           Index end = 0  ) const;
+   bool containsOnlyValue( ValueType value,
+                           IndexType begin = 0,
+                           IndexType end = 0 ) const;
 
    /**
     * \brief Method for saving the data to a binary file \e fileName.
@@ -728,10 +728,10 @@ public:
 
 protected:
    //! Pointer to the data
-   Value* data = nullptr;
+   ValueType* data = nullptr;
 
    //! Array view size
-   Index size = 0;
+   IndexType size = 0;
 };
 
 /**
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index eeb0b1b4b..a2174242f 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -32,7 +32,8 @@ template< typename Value,
           typename Index >
 __cuda_callable__
 ArrayView< Value, Device, Index >::
-ArrayView( Value* data, Index size ) : data(data), size(size)
+ArrayView( ValueType* data, IndexType size )
+: data(data), size(size)
 {
    TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
@@ -46,7 +47,7 @@ template< typename Value,
 __cuda_callable__
 void
 ArrayView< Value, Device, Index >::
-bind( Value* data, Index size )
+bind( ValueType* data, IndexType size )
 {
    TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
@@ -60,7 +61,9 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-void ArrayView< Value, Device, Index >::bind( ArrayView view )
+void
+ArrayView< Value, Device, Index >::
+bind( ArrayView view )
 {
    bind( view.getData(), view.getSize() );
 }
@@ -71,7 +74,7 @@ template< typename Value,
 __cuda_callable__
 typename ArrayView< Value, Device, Index >::ViewType
 ArrayView< Value, Device, Index >::
-getView( const IndexType begin, IndexType end )
+getView( IndexType begin, IndexType end )
 {
    if( end == 0 )
       end = this->getSize();
@@ -84,7 +87,7 @@ template< typename Value,
 __cuda_callable__
 typename ArrayView< Value, Device, Index >::ConstViewType
 ArrayView< Value, Device, Index >::
-getConstView( const IndexType begin, IndexType end ) const
+getConstView( IndexType begin, IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
@@ -157,8 +160,8 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value* ArrayView< Value, Device, Index >::
+const Value*
+ArrayView< Value, Device, Index >::
 getData() const
 {
    return data;
@@ -179,8 +182,8 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value* ArrayView< Value, Device, Index >::
+const Value*
+ArrayView< Value, Device, Index >::
 getArrayData() const
 {
    return data;
@@ -214,7 +217,7 @@ template< typename Value,
 __cuda_callable__
 void
 ArrayView< Value, Device, Index >::
-setElement( Index i, Value value )
+setElement( IndexType i, ValueType value )
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -226,7 +229,7 @@ template< typename Value,
           typename Index >
 __cuda_callable__ Value
 ArrayView< Value, Device, Index >::
-getElement( Index i ) const
+getElement( IndexType i ) const
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
@@ -238,7 +241,7 @@ template< typename Value,
           typename Index >
 __cuda_callable__
 Value& ArrayView< Value, Device, Index >::
-operator[]( Index i )
+operator[]( IndexType i )
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -254,9 +257,9 @@ template< typename Value,
           typename Device,
           typename Index >
 __cuda_callable__
-const
-Value& ArrayView< Value, Device, Index >::
-operator[]( Index i ) const
+const Value&
+ArrayView< Value, Device, Index >::
+operator[]( IndexType i ) const
 {
 #ifdef __CUDA_ARCH__
    TNL_ASSERT_TRUE( (std::is_same< Device, Devices::Cuda >{}()), "Attempt to access data not allocated on CUDA device from CUDA device." );
@@ -302,7 +305,7 @@ template< typename Value,
           typename Index >
 void
 ArrayView< Value, Device, Index >::
-setValue( Value value, const Index begin, Index end )
+setValue( ValueType value, IndexType begin, IndexType end )
 {
    TNL_ASSERT_GT( size, 0, "Attempted to set value to an empty array view." );
    if( end == 0 )
@@ -314,14 +317,15 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
-forElements( const Index begin, Index end, Function&& f )
+void
+ArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f )
 {
    if( ! this->data )
       return;
 
    ValueType* d = this->getData();
-   auto g = [=] __cuda_callable__ ( Index i ) mutable
+   auto g = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       f( i, d[ i ] );
    };
@@ -332,14 +336,15 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
-forElements( const Index begin, Index end, Function&& f ) const
+void
+ArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
 {
    if( ! this->data )
       return;
 
    const ValueType* d = this->getData();
-   auto g = [=] __cuda_callable__ ( Index i )
+   auto g = [=] __cuda_callable__ ( IndexType i )
    {
       f( i, d[ i ] );
    };
@@ -350,7 +355,8 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
+void
+ArrayView< Value, Device, Index >::
 forAllElements( Function&& f )
 {
    this->forElements( 0, this->getSize(), f );
@@ -360,7 +366,8 @@ template< typename Value,
           typename Device,
           typename Index >
    template< typename Function >
-void ArrayView< Value, Device, Index >::
+void
+ArrayView< Value, Device, Index >::
 forAllElements( Function&& f ) const
 {
    this->forElements( 0, this->getSize(), f );
@@ -372,8 +379,9 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
-reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
+Result
+ArrayView< Value, Device, Index >::
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    if( ! this->data )
       return zero;
@@ -389,8 +397,9 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
-reduceElements( Index begin, Index end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
+Result
+ArrayView< Value, Device, Index >::
+reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    if( ! this->data )
-      return;
+      return zero;
@@ -406,7 +415,8 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
+Result
+ArrayView< Value, Device, Index >::
 reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
 {
    return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
@@ -418,7 +428,8 @@ template< typename Value,
    template< typename Fetch,
              typename Reduce,
              typename Result >
-Result ArrayView< Value, Device, Index >::
+Result
+ArrayView< Value, Device, Index >::
 reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
 {
    return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
@@ -429,9 +440,9 @@ template< typename Value,
           typename Index >
 bool
 ArrayView< Value, Device, Index >::
-containsValue( Value value,
-               const Index begin,
-               Index end ) const
+containsValue( ValueType value,
+               IndexType begin,
+               IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
@@ -443,33 +454,21 @@ template< typename Value,
           typename Index >
 bool
 ArrayView< Value, Device, Index >::
-containsOnlyValue( Value value,
-                   const Index begin,
-                   Index end  ) const
+containsOnlyValue( ValueType value,
+                   IndexType begin,
+                   IndexType end ) const
 {
    if( end == 0 )
       end = this->getSize();
    return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
 }
 
-template< typename Value, typename Device, typename Index >
-std::ostream& operator<<( std::ostream& str, const ArrayView< Value, Device, Index >& view )
-{
-   str << "[ ";
-   if( view.getSize() > 0 )
-   {
-      str << view.getElement( 0 );
-      for( Index i = 1; i < view.getSize(); i++ )
-         str << ", " << view.getElement( i );
-   }
-   str << " ]";
-   return str;
-}
-
 template< typename Value,
           typename Device,
           typename Index >
-void ArrayView< Value, Device, Index >::save( const String& fileName ) const
+void
+ArrayView< Value, Device, Index >::
+save( const String& fileName ) const
 {
    File( fileName, std::ios_base::out ) << *this;
 }
@@ -484,6 +483,20 @@ load( const String& fileName )
    File( fileName, std::ios_base::in ) >> *this;
 }
 
+template< typename Value, typename Device, typename Index >
+std::ostream& operator<<( std::ostream& str, const ArrayView< Value, Device, Index >& view )
+{
+   str << "[ ";
+   if( view.getSize() > 0 )
+   {
+      str << view.getElement( 0 );
+      for( Index i = 1; i < view.getSize(); i++ )
+         str << ", " << view.getElement( i );
+   }
+   str << " ]";
+   return str;
+}
+
 // Serialization of array views into binary files.
 template< typename Value, typename Device, typename Index >
 File& operator<<( File& file, const ArrayView< Value, Device, Index > view )
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 9d09ecd50..859e326d4 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -22,8 +22,8 @@ namespace Containers {
  * The template parameters have the same meaning as in \ref Array, with \e Real
  * corresponding to \e Array's \e Value parameter.
  *
- * \tparam Real   An arithmetic type for the vector values, e.g. \ref float or
- *                \ref double.
+ * \tparam Real   An arithmetic type for the vector values, e.g. `float` or
+ *                `double`.
  * \tparam Device The device to be used for the execution of vector operations.
  * \tparam Index  The indexing type.
  * \tparam Allocator The type of the allocator used for the allocation and
@@ -51,7 +51,7 @@ public:
 
    /**
     * \brief Device where the vector is allocated.
-    * 
+    *
     * See \ref Devices::Host or \ref Devices::Cuda.
     */
    using DeviceType = Device;
@@ -63,7 +63,7 @@ public:
 
    /**
     * \brief Allocator type used for allocating this vector.
-    * 
+    *
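-    * See \ref Allocators::Cuda, \ref Allocators::CudaHost, \ref Allocators::CudaManaged, \ref Allocators::Host or \ref Allocators:Default.
+    * See \ref Allocators::Cuda, \ref Allocators::CudaHost, \ref Allocators::CudaManaged, \ref Allocators::Host or \ref Allocators::Default.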
     */
    using AllocatorType = Allocator;
@@ -114,7 +114,7 @@ public:
 
    /**
     * \brief Constructor from expression template
-    * 
+    *
     * @param expression input expression template
     */
    template< typename VectorExpression,
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 83ec6d0b0..2416b8509 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -25,8 +25,8 @@ namespace Containers {
  * The template parameters have the same meaning as in \ref ArrayView, with
  * \e Real corresponding to \e ArrayView's \e Value parameter.
  *
- * \tparam Real   An arithmetic type for the vector values, e.g. \ref float or
- *                \ref double.
+ * \tparam Real   An arithmetic type for the vector values, e.g. `float` or
+ *                `double`.
  * \tparam Device The device to be used for the execution of vector operations.
  * \tparam Index  The indexing type.
  */
-- 
GitLab


From 81232c6685c5448c63b63e38067a305545ad2e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 22:43:37 +0200
Subject: [PATCH 11/13] Moved CudaMultireductionKernel, CudaReductionKernel and
 CudaScanKernel into a 'detail' namespace

The kernels are not interesting for users and Doxygen made a mess out of
various CUDA declarations. The 'detail' namespace is considered internal
and excluded from processing by Doxygen.
---
 src/TNL/Algorithms/Multireduction.hpp         |  4 +--
 src/TNL/Algorithms/Reduction.hpp              |  6 ++--
 src/TNL/Algorithms/Scan.hpp                   |  8 ++---
 .../{ => detail}/CudaMultireductionKernel.h   |  2 ++
 .../{ => detail}/CudaReductionKernel.h        |  2 ++
 .../Algorithms/{ => detail}/CudaScanKernel.h  |  2 ++
 .../Containers/DistributedVectorTest.h        | 32 +++++++++----------
 .../Containers/VectorPrefixSumTest.h          | 32 +++++++++----------
 8 files changed, 47 insertions(+), 41 deletions(-)
 rename src/TNL/Algorithms/{ => detail}/CudaMultireductionKernel.h (99%)
 rename src/TNL/Algorithms/{ => detail}/CudaReductionKernel.h (99%)
 rename src/TNL/Algorithms/{ => detail}/CudaScanKernel.h (99%)

diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp
index 0bfead287..4eb8a9369 100644
--- a/src/TNL/Algorithms/Multireduction.hpp
+++ b/src/TNL/Algorithms/Multireduction.hpp
@@ -19,7 +19,7 @@
 #include <TNL/Assert.h>
 #include <TNL/Algorithms/Multireduction.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
-#include <TNL/Algorithms/CudaMultireductionKernel.h>
+#include <TNL/Algorithms/detail/CudaMultireductionKernel.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
 #include <TNL/Timer.h>
@@ -212,7 +212,7 @@ reduce( const Result zero,
 
    // start the reduction on the GPU
    Result* deviceAux1 = nullptr;
-   const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
+   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/Reduction.hpp
index 7873f9c3c..3c029d9ab 100644
--- a/src/TNL/Algorithms/Reduction.hpp
+++ b/src/TNL/Algorithms/Reduction.hpp
@@ -17,7 +17,7 @@
 //#define CUDA_REDUCTION_PROFILING
 
 #include <TNL/Algorithms/Reduction.h>
-#include <TNL/Algorithms/CudaReductionKernel.h>
+#include <TNL/Algorithms/detail/CudaReductionKernel.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
@@ -311,7 +311,7 @@ reduce( const Index begin,
       timer.start();
    #endif
 
-   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
+   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
 
    // start the reduce on the GPU
    Result* deviceAux1( 0 );
@@ -401,7 +401,7 @@ reduceWithArgument( const Index begin,
       timer.start();
    #endif
 
-   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
+   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
 
    // start the reduce on the GPU
    Result* deviceAux1( nullptr );
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 74351077e..78d5eaf60 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -17,7 +17,7 @@
 #include <TNL/Assert.h>
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/StaticArray.h>
-#include <TNL/Algorithms/CudaScanKernel.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -227,7 +227,7 @@ perform( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
+   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -253,7 +253,7 @@ performFirstPhase( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   return CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -281,7 +281,7 @@ performSecondPhase( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
       end - begin,
       &v.getData()[ begin ],  // output
       blockShifts.getData(),
diff --git a/src/TNL/Algorithms/CudaMultireductionKernel.h b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaMultireductionKernel.h
rename to src/TNL/Algorithms/detail/CudaMultireductionKernel.h
index a7979fb7f..973d8e958 100644
--- a/src/TNL/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
@@ -21,6 +21,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 #ifdef HAVE_CUDA
 /****
@@ -281,5 +282,6 @@ CudaMultireductionKernelLauncher( const Result zero,
 #endif
 }
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaReductionKernel.h
rename to src/TNL/Algorithms/detail/CudaReductionKernel.h
index c3d981fd5..51f38f18b 100644
--- a/src/TNL/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -21,6 +21,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 /****
  * The performance of this kernel is very sensitive to register usage.
@@ -642,5 +643,6 @@ struct CudaReductionKernelLauncher
       Index reducedSize;
 };
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
similarity index 99%
rename from src/TNL/Algorithms/CudaScanKernel.h
rename to src/TNL/Algorithms/detail/CudaScanKernel.h
index 97912b234..63072ea89 100644
--- a/src/TNL/Algorithms/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -19,6 +19,7 @@
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 #ifdef HAVE_CUDA
 
@@ -386,5 +387,6 @@ struct CudaScanKernelLauncher
 
 #endif
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index a90f09506..8dc9d6d26 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -151,12 +151,12 @@ TYPED_TEST( DistributedVectorTest, scan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -164,7 +164,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
@@ -172,7 +172,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -181,7 +181,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -189,7 +189,7 @@ TYPED_TEST( DistributedVectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
@@ -197,12 +197,12 @@ TYPED_TEST( DistributedVectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -270,12 +270,12 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -283,7 +283,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
@@ -291,7 +291,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -300,7 +300,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
@@ -308,7 +308,7 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
@@ -316,12 +316,12 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h
index 7f2151c5e..3c52e9eef 100644
--- a/src/UnitTests/Containers/VectorPrefixSumTest.h
+++ b/src/UnitTests/Containers/VectorPrefixSumTest.h
@@ -83,12 +83,12 @@ TYPED_TEST( VectorTest, scan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -96,7 +96,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
@@ -104,7 +104,7 @@ TYPED_TEST( VectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -113,7 +113,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -121,7 +121,7 @@ TYPED_TEST( VectorTest, scan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
@@ -129,12 +129,12 @@ TYPED_TEST( VectorTest, scan )
       setLinearSequence( v );
       v_host = -1;
       v_view.scan();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -206,12 +206,12 @@ TYPED_TEST( VectorTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -219,7 +219,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
@@ -227,7 +227,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -236,7 +236,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 0 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
@@ -244,7 +244,7 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setConstantSequence( v, 1 );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
@@ -252,12 +252,12 @@ TYPED_TEST( VectorTest, exclusiveScan )
       setLinearSequence( v );
       v_host = -1;
       v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
-- 
GitLab


From addfae5469c42a259aabdd72d3771db21afc5c4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 22:50:21 +0200
Subject: [PATCH 12/13] Various documentation fixes

---
 src/TNL/Containers/StaticArray.h | 26 +++++++++++++-------------
 src/TNL/Meshes/Mesh.h            | 10 +++++-----
 src/TNL/Meshes/MeshEntity.h      |  9 +++++++++
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/src/TNL/Containers/StaticArray.h b/src/TNL/Containers/StaticArray.h
index 5702d9fe7..4f7f753c2 100644
--- a/src/TNL/Containers/StaticArray.h
+++ b/src/TNL/Containers/StaticArray.h
@@ -52,7 +52,7 @@ public:
    /**
     * \brief Constructor from static array.
     *
-    * \param v[Size] input array.
+    * \param v Input array.
     */
    // Note: the template avoids ambiguity of overloaded functions with literal 0 and pointer
    // reference: https://stackoverflow.com/q/4610503
@@ -79,9 +79,9 @@ public:
    /**
     * \brief Constructor which initializes the array by copying elements from
     * \ref std::initializer_list, e.g. `{...}`.
-    * 
+    *
-    * The initializer list size must larger or equal to \e Size.
+    * The initializer list size must be larger than or equal to \e Size.
-    * 
+    *
     * @param elems input initializer list
     */
    __cuda_callable__
@@ -179,9 +179,9 @@ public:
 
    /**
     * \brief Assigns an object \e v of type \e T.
-    * 
+    *
     * T can be:
-    * 
+    *
     * 1. Static linear container implementing operator[] and having the same size.
     * In this case, \e v is copied to this array elementwise.
     * 2. An object that can be converted to \e Value type. In this case all elements
@@ -211,13 +211,13 @@ public:
 
    /**
     * \brief Cast operator for changing of the \e Value type.
-    * 
+    *
     * Returns static array having \e ValueType set to \e OtherValue, i.e.
     * StaticArray< Size, OtherValue >.
-    * 
-    * \tparam OtherValue is the \e Value type of the static array the casting 
+    *
+    * \tparam OtherValue is the \e Value type of the static array the casting
     * will be performed to.
-    * 
+    *
     * \return instance of StaticArray< Size, OtherValue >
     */
    template< typename OtherValue >
@@ -265,7 +265,7 @@ std::ostream& operator<<( std::ostream& str, const StaticArray< Size, Value >& a
 
 /**
  * \brief Serialization of static arrays into binary files.
- * 
+ *
  * \param file output file
  * \param array is an array to be written into the output file.
  */
@@ -274,7 +274,7 @@ File& operator<<( File& file, const StaticArray< Size, Value >& array );
 
 /**
  * \brief Serialization of static arrays into binary files.
- * 
+ *
  * \param file output file
  * \param array is an array to be written into the output file.
  */
@@ -283,7 +283,7 @@ File& operator<<( File&& file, const StaticArray< Size, Value >& array );
 
 /**
  * \brief Deserialization of static arrays from binary files.
- * 
+ *
  * \param file input file
  * \param array is an array to be read from the input file.
  */
@@ -292,7 +292,7 @@ File& operator>>( File& file, StaticArray< Size, Value >& array );
 
 /**
  * \brief Deserialization of static arrays from binary files.
- * 
+ *
  * \param file input file
  * \param array is an array to be read from the input file.
  */
diff --git a/src/TNL/Meshes/Mesh.h b/src/TNL/Meshes/Mesh.h
index fa5cfeb58..41e24412b 100644
--- a/src/TNL/Meshes/Mesh.h
+++ b/src/TNL/Meshes/Mesh.h
@@ -202,7 +202,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forAll( Func f ) const;
@@ -213,7 +213,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forBoundary( Func f ) const;
@@ -224,7 +224,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forInterior( Func f ) const;
@@ -235,7 +235,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forLocal( Func f ) const;
@@ -246,7 +246,7 @@ class Mesh
        * The function \e f is executed as `f(i)`, where `GlobalIndexType i` is the global index of the
        * mesh entity to be processed. The mesh itself is not passed to the function `f`, it is the user's
        * responsibility to ensure proper access to the mesh if needed, e.g. by the means of lambda capture
-       * and/or using a \ref SharedPointer.
+       * and/or using a \ref TNL::Pointers::SharedPointer "SharedPointer".
        */
       template< int EntityDimension, typename Device2 = DeviceType, typename Func >
       void forGhost( Func f ) const;
diff --git a/src/TNL/Meshes/MeshEntity.h b/src/TNL/Meshes/MeshEntity.h
index b077ed045..cc55db2af 100644
--- a/src/TNL/Meshes/MeshEntity.h
+++ b/src/TNL/Meshes/MeshEntity.h
@@ -62,11 +62,20 @@ class MeshEntity
       __cuda_callable__
       bool operator!=( const MeshEntity& entity ) const;
 
+      /**
+       * \brief Returns the dimension of this mesh entity.
+       */
       static constexpr int getEntityDimension();
 
+      /**
+       * \brief Returns a reference to the mesh that owns this mesh entity.
+       */
       __cuda_callable__
       const MeshType& getMesh() const;
 
+      /**
+       * \brief Returns the index of this mesh entity.
+       */
       __cuda_callable__
       GlobalIndexType getIndex() const;
 
-- 
GitLab


From 25ff5a6b82bb8f2b65ca67dde12a550ec2925d46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Apr 2021 23:31:53 +0200
Subject: [PATCH 13/13] Simplified implementation of staticFor and unrolledFor

---
 src/TNL/Algorithms/staticFor.h   | 59 ++++++++++++++++----------------
 src/TNL/Algorithms/unrolledFor.h | 44 +++++++++---------------
 2 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/src/TNL/Algorithms/staticFor.h b/src/TNL/Algorithms/staticFor.h
index ebbfe5ae3..2f1b9d1e0 100644
--- a/src/TNL/Algorithms/staticFor.h
+++ b/src/TNL/Algorithms/staticFor.h
@@ -17,6 +17,13 @@ namespace TNL {
 namespace Algorithms {
 
 namespace detail {
+
+// special dispatch for `begin >= end` (i.e. empty loop)
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+static_for_dispatch( Func &&f )
+{}
+
 #if __cplusplus >= 201703L
 
 // C++17 version using fold expression
@@ -26,49 +33,43 @@ constexpr void static_for_impl( Func &&f, std::integer_sequence< Index, idx... >
    ( f( std::integral_constant<Index, begin + idx>{} ), ... );
 }
 
-#else
-
-// C++14 version using recursion and variadic pack
-template< typename Index, Index begin,  typename Func, Index idx >
-constexpr void static_for_impl( Func &&f, std::integer_sequence< Index, idx > )
-{
-   f( std::integral_constant<Index, begin + idx>{} );
-}
-
-template< typename Index, Index begin,  typename Func, Index idx, Index... indices >
-// WTF why, clang, why...
-//constexpr void
-constexpr std::enable_if_t< sizeof...(indices) >= 1 >
-static_for_impl( Func &&f, std::integer_sequence< Index, idx, indices... > )
+// general dispatch for `begin < end`
+template< typename Index, Index begin, Index end,  typename Func >
+constexpr std::enable_if_t< (begin < end) >
+static_for_dispatch( Func &&f )
 {
    static_for_impl< Index, begin >(
          std::forward< Func >( f ),
-         std::integer_sequence< Index, idx >{}
-   );
-   static_for_impl< Index, begin >(
-         std::forward< Func >( f ),
-         std::integer_sequence< Index, indices... >{}
+         std::make_integer_sequence< Index, end - begin >{}
    );
 }
 
-#endif
+#else
 
-// general specialization for `begin < end`
+// C++14 version using recursive folding
+// (We avoid manual unpacking of std::integer_sequence, because such recursion
+// needs a non-empty sequence as its base case. Folding is done by bisection
+// to limit the recursion depth.)
+
+// special dispatch for 1 iteration
 template< typename Index, Index begin, Index end,  typename Func >
-constexpr std::enable_if_t< (begin < end) >
+constexpr std::enable_if_t< (begin < end && end - begin == 1) >
 static_for_dispatch( Func &&f )
 {
-   static_for_impl< Index, begin >(
-         std::forward< Func >( f ),
-         std::make_integer_sequence< Index, end - begin >{}
-   );
+   f( std::integral_constant< Index, begin >{} );
 }
 
-// specialization for `begin >= end` (i.e. empty loop)
+// general dispatch for at least 2 iterations
 template< typename Index, Index begin, Index end,  typename Func >
-constexpr std::enable_if_t< (begin >= end) >
+constexpr std::enable_if_t< (begin < end && end - begin >= 2) >
 static_for_dispatch( Func &&f )
-{}
+{
+   constexpr Index mid = begin + (end - begin) / 2;
+   static_for_dispatch< Index, begin, mid >( std::forward< Func >( f ) );
+   static_for_dispatch< Index, mid, end >( std::forward< Func >( f ) );
+}
+
+#endif
 
 } // namespace detail
 
diff --git a/src/TNL/Algorithms/unrolledFor.h b/src/TNL/Algorithms/unrolledFor.h
index 7a5477f52..1e16e402c 100644
--- a/src/TNL/Algorithms/unrolledFor.h
+++ b/src/TNL/Algorithms/unrolledFor.h
@@ -17,49 +17,39 @@ namespace Algorithms {
 
 namespace detail {
 
-template< typename Index, Index begin, Index end >
-struct UnrolledFor
-{
-   static_assert( begin < end, "internal error - wrong iteration index for UnrolledFor" );
-
-   template< typename Func >
-   static constexpr void exec( Func&& f )
-   {
-      f( begin );
-      UnrolledFor< Index, begin + 1, end >::exec( std::forward< Func >( f ) );
-   }
-};
+// special dispatch for empty loop
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin >= end) >
+unrolled_for_dispatch( Func&& f )
+{}
 
-template< typename Index, Index end >
-struct UnrolledFor< Index, end, end >
+// special dispatch for 1 iteration
+template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
+constexpr std::enable_if_t< (begin < end && end - begin == 1) >
+unrolled_for_dispatch( Func&& f )
 {
-   template< typename Func >
-   static constexpr void exec( Func&& f ) {}
-};
+   f( begin );
+}
 
-// specialization for short loops - unrolling
+// special dispatch for unrolling short loops (at least 2, but at most unrollFactor iterations)
 template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
-constexpr std::enable_if_t< (begin < end && end - begin <= unrollFactor) >
+constexpr std::enable_if_t< (begin < end && end - begin >= 2 && end - begin <= unrollFactor) >
 unrolled_for_dispatch( Func&& f )
 {
-   UnrolledFor< Index, begin, end >::exec( std::forward< Func >( f ) );
+   constexpr Index mid = begin + (end - begin) / 2;
+   unrolled_for_dispatch< Index, begin, mid, unrollFactor >( std::forward< Func >( f ) );
+   unrolled_for_dispatch< Index, mid, end, unrollFactor >( std::forward< Func >( f ) );
 }
 
 // specialization for long loops - normal for-loop
 template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
-constexpr std::enable_if_t< (begin < end && end - begin > unrollFactor) >
+constexpr std::enable_if_t< (begin < end && end - begin > 1 && end - begin > unrollFactor) >
 unrolled_for_dispatch( Func&& f )
 {
    for( Index i = begin; i < end; i++ )
       f( i );
 }
 
-// specialization for empty loop
-template< typename Index, Index begin, Index end, Index unrollFactor,  typename Func >
-constexpr std::enable_if_t< (begin >= end) >
-unrolled_for_dispatch( Func&& f )
-{}
-
 } // namespace detail
 
 /**
-- 
GitLab